In [1]:
import os

import numpy as np
import pandas as pd
# `display` / `HTML` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import HTML, display

# Directory holding the competition files. Set HACKEREARTH_DATA_DIR to run the
# notebook on another machine; the original location remains the default.
path = os.environ.get(
    'HACKEREARTH_DATA_DIR',
    'C:/Users/dubrangala/OneDrive - VMware, Inc/Case Studies/hackerarth_prediction/dataset',
)
os.chdir(path)

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
## Read Master Data
# `df` holds the train and test rows together; the `myflag` column tells them apart.
# NOTE(review): read_pickle can execute code on load - only open pickles produced by this project.
df = pd.read_pickle('./processed_if_hbos_pca_cblof_vae_lof_bags1.pkl')
# Per-patient phase-1 predictions; `patient_id` and `pred_multi` are the columns used below.
pred_genetic_disorder = pd.read_csv("pred_genetic_disorder_phase1.csv")

In [3]:
# Row counts per split label
df['myflag'].value_counts()

train    21805
test      9465
Name: myflag, dtype: int64

In [4]:
# Distribution of the phase-1 label over the training rows
df.loc[df['myflag'] == 'train', 'genetic_disorder_v1'].value_counts()

Mitochondrial genetic inheritance disorders     11174
Single-gene inheritance diseases                 8371
Multifactorial genetic inheritance disorders     2260
Name: genetic_disorder_v1, dtype: int64

In [5]:
# Distribution of the subclass label over the training rows (value_counts skips missing labels)
df.loc[df['myflag'] == 'train', 'disorder_subclass'].value_counts()

Leigh syndrome                         5160
Mitochondrial myopathy                 4405
Cystic fibrosis                        3448
Tay-Sachs                              2833
Diabetes                               1817
Hemochromatosis                        1355
Leber's hereditary optic neuropathy     648
Alzheimer's                             152
Cancer                                   97
Name: disorder_subclass, dtype: int64

In [7]:
# Labelled rows
df_train = df.loc[df['myflag'] == 'train'].copy()
print("train", df_train.shape)

# Everything that is not flagged as train is scored later
df_valid = df.loc[df['myflag'] != 'train'].copy()
print("valid", df_valid.shape)

# Training rows without a subclass label cannot be used for this target
df_train = df_train.loc[df_train['disorder_subclass'].notnull()].copy()
print("train after", df_train.shape)

train (21805, 83)
valid (9465, 83)
train after (19915, 83)


In [8]:
# Peek at the two phase-1 columns that are merged into the scoring frame
pred_genetic_disorder.loc[:, ['patient_id', 'pred_multi']].head()

Unnamed: 0,patient_id,pred_multi
0,PID0x4175,Multifactorial genetic inheritance disorders
1,PID0x21f5,Mitochondrial genetic inheritance disorders
2,PID0x49b8,Mitochondrial genetic inheritance disorders
3,PID0x2d97,Mitochondrial genetic inheritance disorders
4,PID0x58da,Multifactorial genetic inheritance disorders


In [9]:
# Distribution of the phase-1 predicted classes
pred_genetic_disorder['pred_multi'].value_counts()

Mitochondrial genetic inheritance disorders     5712
Multifactorial genetic inheritance disorders    3238
Single-gene inheritance diseases                 515
Name: pred_multi, dtype: int64

In [10]:
# Attach the phase-1 predicted class to every scoring row (inner join on patient_id);
# the shape displayed below confirms whether any rows were lost or duplicated.
df_valid = pd.merge(
    df_valid,
    pred_genetic_disorder[['patient_id', 'pred_multi']],
    how='inner',
    on='patient_id',
)
df_valid.shape

(9465, 84)

In [11]:
# Predicted-class distribution after the merge
df_valid['pred_multi'].value_counts()

Mitochondrial genetic inheritance disorders     5712
Multifactorial genetic inheritance disorders    3238
Single-gene inheritance diseases                 515
Name: pred_multi, dtype: int64

### Model coding - Functions

In [2]:
# Libraries for the modelling section: standard library first, then third-party
import datetime as dt
import os

import lightgbm as lgbm
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Show every column when a wide frame is displayed
pd.set_option('display.max_columns', None)

# Seed shared by the parameter spaces, the CV splitter and the hyperopt search
randSeed = 17

In [13]:
# Distinct subclass labels present in the training rows
df_train.loc[df_train['myflag'] == 'train', 'disorder_subclass'].unique()

array(["Leber's hereditary optic neuropathy", 'Cystic fibrosis',
       'Diabetes', 'Leigh syndrome', 'Cancer', 'Tay-Sachs',
       'Hemochromatosis', 'Mitochondrial myopathy', "Alzheimer's"],
      dtype=object)

In [66]:
# Lookup tables between the target's string labels and their integer codes.
class_encoding = {
    "Leber's hereditary optic neuropathy": 0,
    'Cystic fibrosis': 1,
    'Diabetes': 2,
    'Leigh syndrome': 3,
    'Cancer': 4,
    'Tay-Sachs': 5,
    'Hemochromatosis': 6,
    'Mitochondrial myopathy': 7,
    "Alzheimer's": 8,
}
# The reverse lookup is derived from the forward one so the two cannot drift apart.
class_decoding = {code: label for label, code in class_encoding.items()}

In [14]:
# Integer-encode the subclass label; a label missing from class_encoding raises KeyError
df_train['target'] = df_train['disorder_subclass'].apply(class_encoding.__getitem__)
# Scoring rows have no label, so they carry the sentinel value 99
df_valid['target'] = 99
df_train['target'].value_counts()

3    5160
7    4405
1    3448
5    2833
2    1817
6    1355
0     648
8     152
4      97
Name: target, dtype: int64

In [17]:
## Create Dummy from phase 1 predictors
# One boolean indicator column per phase-1 class.
# NOTE(review): the training frame derives these from `genetic_disorder_v1`, while the
# scoring frame derives them from the phase-1 *predictions* (`pred_multi`) - confirm that
# this train/score difference is intended.
phase1_dummies = {
    'genetic_disorder_1': 'Mitochondrial genetic inheritance disorders',
    'genetic_disorder_2': 'Single-gene inheritance diseases',
    'genetic_disorder_3': 'Multifactorial genetic inheritance disorders',
}
for frame, source_col in ((df_train, 'genetic_disorder_v1'), (df_valid, 'pred_multi')):
    for dummy_col, class_label in phase1_dummies.items():
        frame[dummy_col] = np.where(frame[source_col] == class_label, True, False)



In [18]:
# First five scoring rows, including the phase-1 prediction and its dummy columns
df_valid.head(5)

Unnamed: 0,patient_id,patient_age,mothers_age,fathers_age,no._of_previous_abortion,white_blood_cell_count_thousand_per_microliter,blood_cell_count_mcl,test_1,test_2,test_3,test_4,test_5,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,myflag,genetic_disorder,disorder_subclass,genetic_disorder_old,disorder_subclass_v1,genetic_disorder_v1,target,flag_train,inherited_from_father_No,inherited_from_father_Yes,maternal_gene_No,maternal_gene_Yes,respiratory_rate_breathsmin_Normal (30-60),respiratory_rate_breathsmin_Tachypnea,heart_rate_ratesmin_Normal,heart_rate_ratesmin_Tachycardia,gender_Ambiguous,gender_Female,gender_Male,birth_asphyxia_No,birth_asphyxia_Not available,birth_asphyxia_Yes,autopsy_shows_birth_defect_if_applicable_No,autopsy_shows_birth_defect_if_applicable_Not applicable,autopsy_shows_birth_defect_if_applicable_Yes,place_of_birth_Home,place_of_birth_Institute,place_of_birth_Not available,folic_acid_details_periconceptional_No,folic_acid_details_periconceptional_Yes,ho_serious_maternal_illness_No,ho_serious_maternal_illness_Yes,ho_radiation_exposure_xray_No,ho_radiation_exposure_xray_Not applicable,ho_radiation_exposure_xray_Yes,ho_substance_abuse_No,ho_substance_abuse_Not applicable,ho_substance_abuse_Yes,assisted_conception_ivfart_No,assisted_conception_ivfart_Yes,history_of_anomalies_in_previous_pregnancies_No,history_of_anomalies_in_previous_pregnancies_Yes,birth_defects_Multiple,birth_defects_Singular,blood_test_result_abnormal,blood_test_result_inconclusive,blood_test_result_normal,blood_test_result_slightly abnormal,genes_in_mothers_side_No,genes_in_mothers_side_Yes,paternal_gene_No,paternal_gene_Yes,status_Alive,status_Deceased,parental_consent_No,parental_consent_Yes,followup_High,followup_Low,followup_Missing,IF_score,HBOS_score,PCA_score,CBLOF_score,VAE_score,FeatureBagging_HBOS_score,FeatureBagging_PCA_score,pred_multi,genetic_disorder_1,genetic_disorder_2,genetic_disorder_3
0,PID0x4175,6.0,38.0,61.0,2.0,-99.0,4.981655,0.0,-99.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,test,,,,,,99,0,False,True,True,False,False,True,True,False,False,False,True,False,False,True,False,True,False,False,True,False,False,True,True,False,False,False,True,False,True,False,True,False,False,True,True,False,False,False,False,True,True,False,True,False,True,False,True,False,False,True,False,1.028612,-278.457619,39764.514416,158.376271,574.322516,-169.976039,77645.803467,Multifactorial genetic inheritance disorders,False,False,True
1,PID0x21f5,10.0,33.0,53.0,1.984923,8.179584,5.11889,0.0,0.0,-99.0,1.0,-99.0,0.0,0.0,0.0,1.0,0.0,test,,,,,,99,0,True,False,False,True,False,True,False,True,False,False,True,False,False,True,False,True,False,False,False,True,False,True,True,False,False,False,True,False,False,True,True,False,False,True,True,False,False,False,True,False,False,True,False,True,True,False,False,True,False,True,False,1.009372,-273.760718,38843.991637,154.718793,476.964034,-166.889813,78003.33081,Mitochondrial genetic inheritance disorders,True,False,False
2,PID0x49b8,5.0,48.0,60.0,0.0,-99.0,4.876204,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,test,,,,,,99,0,True,False,True,False,False,True,True,False,True,False,False,False,True,False,False,False,True,False,True,False,True,False,False,True,False,False,True,False,False,True,False,True,True,False,False,True,False,False,False,True,True,False,True,False,False,True,True,False,False,True,False,1.02761,-282.07904,31679.703804,125.807391,503.865666,-171.957654,64534.909557,Mitochondrial genetic inheritance disorders,True,False,False
3,PID0x2d97,13.0,25.0,55.0,1.996537,6.884071,4.687767,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,test,,,,,,99,0,False,True,False,True,False,True,True,False,True,False,False,True,False,False,False,True,False,False,False,True,False,True,False,True,False,False,True,True,False,False,True,False,False,True,False,True,False,False,True,False,True,False,True,False,True,False,True,False,False,True,False,1.022152,-276.410784,15786.720762,61.632862,231.328457,-168.874856,26131.862884,Mitochondrial genetic inheritance disorders,True,False,False
4,PID0x58da,5.0,41.0,38.0,2.009769,6.195178,5.152362,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,test,,,,,,99,0,False,True,False,True,False,True,False,True,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,True,False,True,False,True,False,True,False,True,False,False,False,True,False,True,False,False,True,False,True,False,True,False,True,False,0.985659,-283.648972,14367.28736,55.987886,211.247121,-174.934528,27819.026146,Multifactorial genetic inheritance disorders,False,False,True


In [17]:
def parameterSpace(selected_classfier='xgboost'):
    """Return the hyperopt search space for the chosen classifier.

    Parameters
    ----------
    selected_classfier : str, default 'xgboost'
        One of 'xgboost', 'lightgbm' or 'RF'.

    Returns
    -------
    dict
        Maps parameter names to hyperopt expressions (tuned values) or to plain
        constants (fixed settings that are passed through unchanged).

    Raises
    ------
    ValueError
        If ``selected_classfier`` is not one of the supported names. Previously
        this case printed a message and then failed with UnboundLocalError on
        the ``return`` statement.
    """
    # XGBOOST earlier exploration
    if selected_classfier == 'xgboost':
        space = {
            'n_estimators': hp.quniform('n_estimators', 50, 800, 20),
            'max_depth': hp.choice('max_depth', np.arange(5, 25, dtype=int)),
            'min_child_weight': hp.quniform('min_child_weight', 2, 30, 1),
            'subsample': hp.quniform('subsample', 0.1, 0.9, 0.005),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.09, 0.9, 0.05),
            'colsample_bylevel': hp.quniform('colsample_bylevel', 0.1, 1.0, 0.1),
            'colsample_bynode': hp.quniform('colsample_bynode', 0.1, 1.0, 0.1),
            'learning_rate': hp.quniform('learning_rate', 0.015, 0.15, 0.005),
            'gamma': hp.choice('gamma', np.arange(0, 3, dtype=int)),
            #'eval_metric': 'auc',
            'early_stopping_rounds': 50,
            #'num_parallel_tree':50, # no need
            'nthread': 900,
            'updater': 'grow_gpu_hist',
            #'n_jobs': 10, # gpu pred doesnot need n_jobs
            'silent': 1,
            'predictor': 'gpu_predictor',
            'tree_method': 'gpu_hist',
            'verbosity': 0,
            'missing': np.nan,
            'sampling_method': 'gradient_based',  # works only with gpu_hist
            'seed': randSeed,
        }
    elif selected_classfier == 'lightgbm':
        space = {
            'num_leaves': hp.quniform('num_leaves', 20, 800, 10),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 0.95, 0.015),
            'learning_rate': hp.quniform('learning_rate', 0.015, 0.22, 0.005),
            'min_child_samples': hp.quniform('min_child_samples', 2, 100, 2),
            # NOTE(review): hp.lognormal takes (label, mu, sigma), so 1e-8 is the
            # log-space mean here, not a lower bound - confirm this is intended.
            'lambda_l1': hp.lognormal('lambda_l1', 1e-8, 3.0),
            'lambda_l2': hp.lognormal('lambda_l2', 1e-8, 3.0),
            'bagging_freq': hp.quniform('bagging_freq', 2, 50, 1),
            'subsample': hp.quniform('subsample', 0.1, 0.95, 0.015),
            'device': 'cpu',
            'first_metric_only': False,
            'n_jobs': -1,
            'eval_names': 'hackathon',
            'tree_learner': 'serial',
            'random_state': randSeed,
        }
        # 600 runs
    elif selected_classfier == 'RF':
        space = {
            'n_estimators': hp.quniform('n_estimators', 50, 1500, 20),  # number of trees, change it to 1000 for better results
            'criterion': hp.choice('criterion', ['gini', 'entropy']),
            'max_depth': hp.choice('max_depth', np.arange(2, 20, dtype=int)),
            'min_samples_split': hp.quniform('min_samples_split', 2, 200, 2),
            'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 5, 1),
            'min_weight_fraction_leaf': hp.quniform('min_weight_fraction_leaf', 0.0, 0.48, 0.01),
            'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2']),
            'min_impurity_decrease': hp.quniform('min_impurity_decrease', 0.0, 0.9, 0.01),
            'oob_score': hp.choice('oob_score', [False, True]),
            'max_samples': hp.quniform('max_samples', 0.01, 0.99, 0.01),
        }
    else:
        # Fail loudly: there is no search space to return for an unknown name.
        raise ValueError(
            "Unsupported classifier %r: only 'RF', 'lightgbm' and 'xgboost' are supported"
            % (selected_classfier,)
        )

    return space

In [38]:
# Shape check on the labelled training frame before pre-processing
df_train.shape

(19915, 86)

In [20]:
# Class balance of the integer-encoded target
df_train['target'].value_counts()

3    5160
7    4405
1    3448
5    2833
2    1817
6    1355
0     648
8     152
4      97
Name: target, dtype: int64

#### Preprocessing of data

In [21]:
# Candidate feature names: every column except the label, flag and identifier columns.
df_names = df_train.columns.tolist()
for non_feature in (
    'target',
    'flag_train',
    'myflag',
    'genetic_disorder',
    'disorder_subclass',
    'genetic_disorder_old',
    'disorder_subclass_v1',
    'genetic_disorder_v1',
    'patient_id',
):
    # list.remove raises ValueError if an expected column is absent
    df_names.remove(non_feature)


In [23]:
#df_train.to_pickle('./data_disorder_subclass_train.pkl')
#df_valid.to_pickle('./data_disorder_subclass_score.pkl')

In [32]:
# Reload the prepared training / scoring frames from disk.
# NOTE(review): the only to_pickle calls for these two files are commented out in the
# previous cell, so a fresh run depends on the files already existing - confirm.
df_train1 = pd.read_pickle('./data_disorder_subclass_train.pkl')
df_valid1 = pd.read_pickle('./data_disorder_subclass_score.pkl')

In [33]:
# Identifier, split-flag and raw label columns that must not be fed to the model
drop_col = [
    'myflag',
    'flag_train',
    'genetic_disorder',
    'disorder_subclass',
    'genetic_disorder_old',
    'disorder_subclass_v1',
    'genetic_disorder_v1',
    'patient_id',
]
df_train = df_train1.drop(drop_col, axis=1).copy()
df_valid = df_valid1.drop(drop_col, axis=1).copy()

In [34]:
# Features of the test data to be submitted. 'target' only holds the placeholder 99
# assigned to the scoring rows earlier, and 'pred_multi' is the raw phase-1 label string.
X_score = df_valid.drop(['target','pred_multi'],axis=1)

In [36]:
# Write the modelling inputs as CSV without the index column:
# the labelled training frame and the scoring features.
df_train.to_csv("train_1.csv",index=False)
X_score.to_csv("X_score.csv",index=False)

In [37]:
# NOTE: this binds the name `datetime` to the module. A later cell rebinds it to the
# class with `from datetime import datetime`, so re-running this cell afterwards
# changes what a bare `datetime.now()` resolves to.
import datetime

# Print the current wall-clock time
x = datetime.datetime.now()
print(x)

2021-08-01 16:23:48.979138


In [38]:
# predict the majority class
def majorityPredictor(arr):
    """Return the most frequent value in ``arr``.

    Parameters
    ----------
    arr : object exposing ``tolist()`` (e.g. a numpy array or pandas Series)
        Values to vote over.

    Returns
    -------
    The value with the highest count. On a tie, the value whose first
    occurrence comes earliest wins.

    Raises
    ------
    ValueError
        If ``arr`` is empty.
    """
    from collections import Counter

    lst = arr.tolist()
    try:
        # Single pass instead of calling lst.count once per element (quadratic).
        counts = Counter(lst)
    except TypeError:
        # Unhashable elements (e.g. rows of a 2-D array): fall back to the direct scan.
        return max(lst, key=lst.count)
    # Counter keeps first-seen order, so max() breaks ties like the direct scan does.
    occ_ = max(counts, key=counts.get)
    return occ_

In [39]:
def myClassfier_multi(selected_classfier,params,X_train, y_train,X_valid,y_valid,X_score):
    """Fit one classifier on a training fold, then predict the validation fold and the scoring set.

    Parameters
    ----------
    selected_classfier : str
        'xgboost', 'lightgbm' or 'RF'.
    params : dict
        Hyper-parameter values; which keys are read depends on the classifier.
        Values that must be integers are cast with ``int`` before use.
    X_train, X_valid, X_score : objects exposing ``to_numpy()``
        Feature tables. They are converted to plain arrays before fitting, so
        the model does not see column names.
    y_train, y_valid : array-like
        Labels for the training and validation folds.

    Returns
    -------
    tuple
        ``(accuracy_1, y_score_pred)``: accuracy on the validation fold and the
        predicted labels for ``X_score``.

    Raises
    ------
    ValueError
        If ``selected_classfier`` is not supported. Previously this surfaced
        only after the conversions, as a NameError on the unbound ``clf``.
    """
    if selected_classfier not in ('xgboost', 'lightgbm', 'RF'):
        raise ValueError(
            "Unsupported classifier %r: only 'RF', 'lightgbm' and 'xgboost' are supported"
            % (selected_classfier,)
        )

    X_train = X_train.to_numpy()
    X_valid = X_valid.to_numpy()
    X_score = X_score.to_numpy()

    if selected_classfier == 'xgboost':
        # colsample_bytree / colsample_bylevel / colsample_bynode / gamma / silent / missing
        # may be present in `params` but are deliberately not forwarded.
        clf = XGBClassifier(
            learning_rate=params['learning_rate'],
            max_depth=int(params['max_depth']),
            min_child_weight=int(params['min_child_weight']),
            n_estimators=int(params['n_estimators']),
            nthread=int(params['nthread']),
            objective='multi:softmax',
            seed=params['seed'],
            subsample=params['subsample'],
            verbosity=params['verbosity'],
            early_stopping_rounds=int(params['early_stopping_rounds']),
            predictor=params['predictor'],
            tree_method=params['tree_method'],
            updater=params['updater'],
            sampling_method=params['sampling_method'],
            use_label_encoder=False,
        )
        clf.fit(X_train, y_train, verbose=0)
    elif selected_classfier == 'lightgbm':
        # lambda_l1 / lambda_l2 / eval_names may be present in `params` but are
        # deliberately not forwarded; n_estimators is left at the library default.
        clf = LGBMClassifier(
            objective='multiclass',
            num_leaves=int(params['num_leaves']),
            colsample_bytree=params['colsample_bytree'],
            learning_rate=params['learning_rate'],
            min_child_samples=int(params['min_child_samples']),
            subsample=params['subsample'],
            bagging_freq=int(params['bagging_freq']),
            verbosity=-1,
            device=params['device'],
            first_metric_only=params['first_metric_only'],
            n_jobs=int(params['n_jobs']),
            random_state=int(params['random_state']),
            tree_learner=params['tree_learner'],
        )
        # No `verbose=` here: it only affected evaluation-set logging (none is
        # passed) and LightGBM 4 removed the argument from fit(). Output is
        # already silenced through verbosity=-1 above.
        clf.fit(X_train, y_train)
    else:  # 'RF'
        clf = RandomForestClassifier(
            n_estimators=int(params['n_estimators']),
            criterion=params['criterion'],
            max_depth=int(params['max_depth']),
            min_samples_split=int(params['min_samples_split']),
            min_samples_leaf=int(params['min_samples_leaf']),
            min_weight_fraction_leaf=params['min_weight_fraction_leaf'],
            max_features=params['max_features'],
            min_impurity_decrease=params['min_impurity_decrease'],
            oob_score=params['oob_score'],
            max_samples=params['max_samples'],
        )
        clf.fit(X_train, y_train)

    y_pred = clf.predict(X_valid)
    y_score_pred = clf.predict(X_score)
    accuracy_1 = accuracy_score(y_valid, y_pred)

    # Release the fitted model before the next fold is trained.
    del clf
    gc.collect()

    return accuracy_1,y_score_pred

In [40]:
## Stratified K-fold driver: trains one model per fold and collects, for every fold,
## the held-out accuracy and the predictions on the scoring data.
def getPredictors_multi(selected_classfier,params,df_train,X_score,n_splits,seed_):
    """Run stratified K-fold training and gather per-fold results.

    Parameters
    ----------
    selected_classfier : str
        Classifier name forwarded to ``myClassfier_multi``.
    params : dict
        Hyper-parameters forwarded to ``myClassfier_multi``.
    df_train : pandas.DataFrame
        Training data; must contain a 'target' column, which is used both as the
        label and as the stratification key.
    X_score : pandas.DataFrame
        Scoring features, predicted once per fold.
    n_splits : int-like
        Number of folds (cast with ``int``).
    seed_ : int
        Random state of the shuffled splitter.

    Returns
    -------
    tuple
        ``(final, mean_accuracy_)`` where ``final`` has one column per fold with
        that fold's predictions for ``X_score`` and ``mean_accuracy_`` is the
        mean of the per-fold validation accuracies.
    """
    splitter = StratifiedKFold(n_splits=int(n_splits), shuffle=True, random_state=seed_)
    fold_accuracies = []
    fold_score_preds = []

    for trn_idx, val_idx in splitter.split(df_train, df_train['target']):
        fold_trn = df_train.iloc[trn_idx]
        fold_val = df_train.iloc[val_idx]
        fold_acc, fold_pred = myClassfier_multi(
            selected_classfier,
            params,
            fold_trn.drop(columns=['target']),
            fold_trn['target'],
            fold_val.drop(columns=['target']),
            fold_val['target'],
            X_score,
        )
        fold_accuracies.append(fold_acc)
        fold_score_preds.append(fold_pred)

    mean_accuracy_ = np.mean(fold_accuracies)
    print("Mean Accuracy: "+str(mean_accuracy_))

    # Transpose so that each fold becomes a column.
    final = pd.DataFrame(fold_score_preds).T
    return final,mean_accuracy_

In [46]:
param_score=[]
## Loss function for hyper-parameter tuning. Every evaluated parameter set and its
## accuracy are appended to 'hyper_multi.csv'.
def score_multi(params):
    """Hyperopt objective: negated mean cross-validated accuracy for ``params``.

    Reads the notebook globals ``selected_classfier``, ``df_train``, ``X_score``
    and ``randSeed``. Appends one row (classifier, params, accuracy, timestamp)
    to 'hyper_multi.csv' on every call.

    Parameters
    ----------
    params : dict
        One sampled point of the search space.

    Returns
    -------
    float
        ``-accuracy``, because the optimiser minimises its objective.
    """
    print(params)
    # Only the mean fold accuracy is needed; the per-fold predictions are discarded.
    _, multi_acc_ = getPredictors_multi(selected_classfier, params,
                                        df_train, X_score, n_splits=5,
                                        seed_=randSeed)

    # Use the `dt` module alias: the bare name `datetime` is bound to the module in
    # one cell and to the class in another, so it depends on which cell ran last.
    run_stamp = dt.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    pd.DataFrame([selected_classfier, params, multi_acc_,
                  run_stamp]).T.to_csv('hyper_multi.csv', mode='a', header=False)

    return -multi_acc_

In [44]:
#import hyperopt as hp
from hyperopt import hp
from hyperopt import fmin, tpe
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings("ignore")
import lightgbm as lgb
import xgboost as xgb
from  lightgbm import LGBMClassifier
# Import the estimator class itself: `import xgboost as XGBClassifier` bound the
# whole module to this name, so calling XGBClassifier(...) could not work.
from xgboost import XGBClassifier
# Used by the 'RF' branch of myClassfier_multi but was never imported.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (f1_score, accuracy_score, confusion_matrix, recall_score,
                             precision_score, log_loss, roc_auc_score, roc_curve,
                             precision_recall_curve)
from datetime import datetime
import gc
gc.collect()

6593

In [47]:
## Hyper-parameter tuning with LightGBM as the classifier (TPE search, max_evals=100 evaluations)
selected_classfier='lightgbm'
space = parameterSpace(selected_classfier)

# Use Hyperopt's fmin to minimise score_multi (the negated mean CV accuracy) over the search space.
# rstate seeds the search with randSeed.
best_lightgbm = fmin(score_multi, space, algo=tpe.suggest, 
                # trials=trials, 
                max_evals=100,rstate= np.random.RandomState(randSeed))

{'bagging_freq': 15.0, 'colsample_bytree': 0.765, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 1.2785178546988023, 'lambda_l2': 24.28210493447922, 'learning_rate': 0.17500000000000002, 'min_child_samples': 44.0, 'n_jobs': -1, 'num_leaves': 340.0, 'random_state': 17, 'subsample': 0.825, 'tree_learner': 'serial'}
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
Mean Accuracy: 0.48516193823750947             

bagging_freq is set=38, subsample_freq=0 will be ignored. Current value: bagging_freq=38                               
Mean Accuracy: 0.6762239517951294                                                                                      
{'bagging_freq': 17.0, 'colsample_bytree': 0.255, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 1.484935638644304, 'lambda_l2': 0.20758511163197, 'learning_rate': 0.17500000000000002, 'min_child_samples': 12.0, 'n_jobs': -1, 'num_leaves': 540.0, 'random_state': 17, 'subsample': 0.87, 'tree_learner': 'serial'}
bagging_freq is set=17, subsample_freq=0 will be ignored. Current value: bagging_freq=17                               
bagging_freq is set=17, subsample_freq=0 will be ignored. Current value: bagging_freq=17                               
bagging_freq is set=17, subsample_freq=0 will be ignored. Current value: bagging_freq=17                               
bagging_freq is set=17, subsample_freq=0 will be i

bagging_freq is set=21, subsample_freq=0 will be ignored. Current value: bagging_freq=21                               
bagging_freq is set=21, subsample_freq=0 will be ignored. Current value: bagging_freq=21                               
Mean Accuracy: 0.6897313582726589                                                                                      
{'bagging_freq': 23.0, 'colsample_bytree': 0.21, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 142.2037375347965, 'lambda_l2': 43.015645618287195, 'learning_rate': 0.18, 'min_child_samples': 16.0, 'n_jobs': -1, 'num_leaves': 110.0, 'random_state': 17, 'subsample': 0.21, 'tree_learner': 'serial'}
bagging_freq is set=23, subsample_freq=0 will be ignored. Current value: bagging_freq=23                               
bagging_freq is set=23, subsample_freq=0 will be ignored. Current value: bagging_freq=23                               
bagging_freq is set=23, subsample_freq=0 will be ignored. Curren

bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
Mean Accuracy: 0.6409239266884258                                                                                      
{'bagging_freq': 21.0, 'colsample_bytree': 0.63, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.014497072794450847, 'lambda_l2': 0.6004778465514929, 'learning_rate': 0.195, 'min_child_samples': 26.0, 'n_jobs': -1, 'num_leaves': 340.0, 'random_state': 17, 'subsample': 0.9299999999999999, 'tree_learner': 'serial'}
bagging_freq is set=21, subsample_freq=0 will be ignored. Current value: bagging_freq=21                               
bagging_freq is set=21, subsample_freq=0 will 

bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6                                 
bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6                                 
bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6                                 
bagging_freq is set=6, subsample_freq=0 will be ignored. Current value: bagging_freq=6                                 
Mean Accuracy: 0.6780818478533768                                                                                      
{'bagging_freq': 27.0, 'colsample_bytree': 0.75, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.06821773277197171, 'lambda_l2': 0.0001699345682143515, 'learning_rate': 0.025, 'min_child_samples': 100.0, 'n_jobs': -1, 'num_leaves': 630.0, 'random_state': 17, 'subsample': 0.44999999999999996, 'tree_learner': 'serial'}
bagging_freq is set=27, subsample_freq=0 w

bagging_freq is set=36, subsample_freq=0 will be ignored. Current value: bagging_freq=36                               
bagging_freq is set=36, subsample_freq=0 will be ignored. Current value: bagging_freq=36                               
bagging_freq is set=36, subsample_freq=0 will be ignored. Current value: bagging_freq=36                               
bagging_freq is set=36, subsample_freq=0 will be ignored. Current value: bagging_freq=36                               
bagging_freq is set=36, subsample_freq=0 will be ignored. Current value: bagging_freq=36                               
Mean Accuracy: 0.6944514185287473                                                                                      
{'bagging_freq': 34.0, 'colsample_bytree': 0.735, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.04488287536525709, 'lambda_l2': 8.241787721817445e-09, 'learning_rate': 0.015, 'min_child_samples': 94.0, 'n_jobs': -1, 'num_leaves': 710.0, 'r

bagging_freq is set=12, subsample_freq=0 will be ignored. Current value: bagging_freq=12                               
bagging_freq is set=12, subsample_freq=0 will be ignored. Current value: bagging_freq=12                               
bagging_freq is set=12, subsample_freq=0 will be ignored. Current value: bagging_freq=12                               
bagging_freq is set=12, subsample_freq=0 will be ignored. Current value: bagging_freq=12                               
bagging_freq is set=12, subsample_freq=0 will be ignored. Current value: bagging_freq=12                               
Mean Accuracy: 0.6829525483304042                                                                                      
{'bagging_freq': 13.0, 'colsample_bytree': 0.105, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.11189074306969721, 'lambda_l2': 2.0478916418671492e-09, 'learning_rate': 0.12, 'min_child_samples': 42.0, 'n_jobs': -1, 'num_leaves': 700.0, 'r

bagging_freq is set=22, subsample_freq=0 will be ignored. Current value: bagging_freq=22                               
bagging_freq is set=22, subsample_freq=0 will be ignored. Current value: bagging_freq=22                               
bagging_freq is set=22, subsample_freq=0 will be ignored. Current value: bagging_freq=22                               
bagging_freq is set=22, subsample_freq=0 will be ignored. Current value: bagging_freq=22                               
bagging_freq is set=22, subsample_freq=0 will be ignored. Current value: bagging_freq=22                               
Mean Accuracy: 0.6815967863419533                                                                                      
{'bagging_freq': 8.0, 'colsample_bytree': 0.87, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.6030837483566387, 'lambda_l2': 0.00041234962667038833, 'learning_rate': 0.08, 'min_child_samples': 2.0, 'n_jobs': -1, 'num_leaves': 220.0, 'rando

bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15                               
Mean Accuracy: 0.46829023349234244                                                                                     
{'bagging_freq': 5.0, 'colsample_bytree': 0.585, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.23249358763726408, 'lambda_l2': 0.0017626507459207338, 'learning_rate': 0.055, 'min_child_samples': 54.0, 'n_jobs': -1, 'num_leaves': 480.0, 'ra

bagging_freq is set=10, subsample_freq=0 will be ignored. Current value: bagging_freq=10                               
bagging_freq is set=10, subsample_freq=0 will be ignored. Current value: bagging_freq=10                               
bagging_freq is set=10, subsample_freq=0 will be ignored. Current value: bagging_freq=10                               
bagging_freq is set=10, subsample_freq=0 will be ignored. Current value: bagging_freq=10                               
bagging_freq is set=10, subsample_freq=0 will be ignored. Current value: bagging_freq=10                               
Mean Accuracy: 0.5998493597790611                                                                                      
{'bagging_freq': 39.0, 'colsample_bytree': 0.645, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 88.20945420967928, 'lambda_l2': 3.7138029100257863e-06, 'learning_rate': 0.095, 'min_child_samples': 30.0, 'n_jobs': -1, 'num_leaves': 230.0, 'ra

bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
Mean Accuracy: 0.6880743158423299                                                                                      
{'bagging_freq': 13.0, 'colsample_bytree': 0.615, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.22329347996891272, 'lambda_l2': 42601.253264234525, 'learning_rate': 0.115, 'min_child_samples': 68.0, 'n_jobs': -1, 'num_leaves': 30.0, 'rando

bagging_freq is set=49, subsample_freq=0 will be ignored. Current value: bagging_freq=49                               
bagging_freq is set=49, subsample_freq=0 will be ignored. Current value: bagging_freq=49                               
bagging_freq is set=49, subsample_freq=0 will be ignored. Current value: bagging_freq=49                               
bagging_freq is set=49, subsample_freq=0 will be ignored. Current value: bagging_freq=49                               
bagging_freq is set=49, subsample_freq=0 will be ignored. Current value: bagging_freq=49                               
Mean Accuracy: 0.6062264624654783                                                                                      
{'bagging_freq': 29.0, 'colsample_bytree': 0.27, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.003637800899214282, 'lambda_l2': 0.0036201447734319886, 'learning_rate': 0.025, 'min_child_samples': 14.0, 'n_jobs': -1, 'num_leaves': 610.0, 'r

bagging_freq is set=20, subsample_freq=0 will be ignored. Current value: bagging_freq=20                               
bagging_freq is set=20, subsample_freq=0 will be ignored. Current value: bagging_freq=20                               
bagging_freq is set=20, subsample_freq=0 will be ignored. Current value: bagging_freq=20                               
bagging_freq is set=20, subsample_freq=0 will be ignored. Current value: bagging_freq=20                               
bagging_freq is set=20, subsample_freq=0 will be ignored. Current value: bagging_freq=20                               
Mean Accuracy: 0.653427065026362                                                                                       
{'bagging_freq': 4.0, 'colsample_bytree': 0.44999999999999996, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.2900619201788884, 'lambda_l2': 1.2727240298071303e-05, 'learning_rate': 0.2, 'min_child_samples': 98.0, 'n_jobs': -1, 'num_leaves'

bagging_freq is set=17, subsample_freq=0 will be ignored. Current value: bagging_freq=17                               
bagging_freq is set=17, subsample_freq=0 will be ignored. Current value: bagging_freq=17                               
bagging_freq is set=17, subsample_freq=0 will be ignored. Current value: bagging_freq=17                               
bagging_freq is set=17, subsample_freq=0 will be ignored. Current value: bagging_freq=17                               
bagging_freq is set=17, subsample_freq=0 will be ignored. Current value: bagging_freq=17                               
Mean Accuracy: 0.6160682902334924                                                                                      
{'bagging_freq': 36.0, 'colsample_bytree': 0.8099999999999999, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.07251787181134553, 'lambda_l2': 1.27505067779687e-08, 'learning_rate': 0.02, 'min_child_samples': 96.0, 'n_jobs': -1, 'num_leaves'

bagging_freq is set=43, subsample_freq=0 will be ignored. Current value: bagging_freq=43                               
bagging_freq is set=43, subsample_freq=0 will be ignored. Current value: bagging_freq=43                               
bagging_freq is set=43, subsample_freq=0 will be ignored. Current value: bagging_freq=43                               
bagging_freq is set=43, subsample_freq=0 will be ignored. Current value: bagging_freq=43                               
bagging_freq is set=43, subsample_freq=0 will be ignored. Current value: bagging_freq=43                               
Mean Accuracy: 0.6895305046447401                                                                                      
{'bagging_freq': 47.0, 'colsample_bytree': 0.645, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.09965438694634546, 'lambda_l2': 3.386261198249945e-08, 'learning_rate': 0.035, 'min_child_samples': 90.0, 'n_jobs': -1, 'num_leaves': 690.0, 'r

bagging_freq is set=38, subsample_freq=0 will be ignored. Current value: bagging_freq=38                               
bagging_freq is set=38, subsample_freq=0 will be ignored. Current value: bagging_freq=38                               
bagging_freq is set=38, subsample_freq=0 will be ignored. Current value: bagging_freq=38                               
bagging_freq is set=38, subsample_freq=0 will be ignored. Current value: bagging_freq=38                               
bagging_freq is set=38, subsample_freq=0 will be ignored. Current value: bagging_freq=38                               
Mean Accuracy: 0.6902837057494351                                                                                      
{'bagging_freq': 42.0, 'colsample_bytree': 0.57, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.047403148472708136, 'lambda_l2': 9.128305758873292e-08, 'learning_rate': 0.04, 'min_child_samples': 98.0, 'n_jobs': -1, 'num_leaves': 650.0, 'ra

bagging_freq is set=47, subsample_freq=0 will be ignored. Current value: bagging_freq=47                               
bagging_freq is set=47, subsample_freq=0 will be ignored. Current value: bagging_freq=47                               
bagging_freq is set=47, subsample_freq=0 will be ignored. Current value: bagging_freq=47                               
bagging_freq is set=47, subsample_freq=0 will be ignored. Current value: bagging_freq=47                               
bagging_freq is set=47, subsample_freq=0 will be ignored. Current value: bagging_freq=47                               
Mean Accuracy: 0.6772784333417022                                                                                      
{'bagging_freq': 38.0, 'colsample_bytree': 0.48, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.019159149471646444, 'lambda_l2': 7.157350121706578e-08, 'learning_rate': 0.055, 'min_child_samples': 70.0, 'n_jobs': -1, 'num_leaves': 550.0, 'r

bagging_freq is set=35, subsample_freq=0 will be ignored. Current value: bagging_freq=35                               
bagging_freq is set=35, subsample_freq=0 will be ignored. Current value: bagging_freq=35                               
bagging_freq is set=35, subsample_freq=0 will be ignored. Current value: bagging_freq=35                               
bagging_freq is set=35, subsample_freq=0 will be ignored. Current value: bagging_freq=35                               
bagging_freq is set=35, subsample_freq=0 will be ignored. Current value: bagging_freq=35                               
Mean Accuracy: 0.6319859402460457                                                                                      
{'bagging_freq': 50.0, 'colsample_bytree': 0.21, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.7886076932593865, 'lambda_l2': 1.560468608322331e-07, 'learning_rate': 0.015, 'min_child_samples': 62.0, 'n_jobs': -1, 'num_leaves': 800.0, 'ran

bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
bagging_freq is set=31, subsample_freq=0 will be ignored. Current value: bagging_freq=31                               
Mean Accuracy: 0.6968114486567913                                                                                      
{'bagging_freq': 30.0, 'colsample_bytree': 0.435, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 5.964725617816351, 'lambda_l2': 3.520962880517998e-07, 'learning_rate': 0.04, 'min_child_samples': 58.0, 'n_jobs': -1, 'num_leaves': 140.0, 'rand

bagging_freq is set=39, subsample_freq=0 will be ignored. Current value: bagging_freq=39                               
bagging_freq is set=39, subsample_freq=0 will be ignored. Current value: bagging_freq=39                               
bagging_freq is set=39, subsample_freq=0 will be ignored. Current value: bagging_freq=39                               
bagging_freq is set=39, subsample_freq=0 will be ignored. Current value: bagging_freq=39                               
bagging_freq is set=39, subsample_freq=0 will be ignored. Current value: bagging_freq=39                               
Mean Accuracy: 0.6811448656791363                                                                                      
{'bagging_freq': 36.0, 'colsample_bytree': 0.6599999999999999, 'device': 'cpu', 'eval_names': 'hackathon', 'first_metric_only': False, 'lambda_l1': 0.002036288010681879, 'lambda_l2': 2.188241337238082e-07, 'learning_rate': 0.02, 'min_child_samples': 74.0, 'n_jobs': -1, 'num_leave

bagging_freq is set=37, subsample_freq=0 will be ignored. Current value: bagging_freq=37                               
bagging_freq is set=37, subsample_freq=0 will be ignored. Current value: bagging_freq=37                               
bagging_freq is set=37, subsample_freq=0 will be ignored. Current value: bagging_freq=37                               
bagging_freq is set=37, subsample_freq=0 will be ignored. Current value: bagging_freq=37                               
bagging_freq is set=37, subsample_freq=0 will be ignored. Current value: bagging_freq=37                               
Mean Accuracy: 0.5582224453929199                                                                                      
100%|█████████████████████████████████████████████| 100/100 [42:08<00:00, 25.28s/trial, best loss: -0.6984182776801406]


In [48]:
best_lightgbm

{'bagging_freq': 42.0,
 'colsample_bytree': 0.57,
 'lambda_l1': 0.047403148472708136,
 'lambda_l2': 9.128305758873292e-08,
 'learning_rate': 0.04,
 'min_child_samples': 98.0,
 'num_leaves': 650.0,
 'subsample': 0.855}

In [50]:
# LightGBM configuration built from the tuned values reported by the search,
# plus the fixed settings (device, objective, seed, ...) written out explicitly.
lgbm_params = {
    'bagging_freq': 42.0,
    'colsample_bytree': 0.57,
    'device': 'cpu',
    'eval_names': 'tanzu',
    'first_metric_only': False,
    'lambda_l1': 0.047403148472708136,
    'lambda_l2': 9.128305758873292e-08,
    'learning_rate': 0.04,
    'max_depth': -1,
    'min_child_samples': 98.0,
    'n_jobs': -1,
    'num_leaves': 650.0,
    'objective': 'multiclass',
    'random_state': 17,
    'subsample': 0.855,
    'tree_learner': 'serial',
}
selected_classfier = 'lightgbm'

# Re-run the 5-split cross-validation with the chosen parameters to obtain
# the per-fold predictions on X_score and the mean validation accuracy.
y_pred_score_multi, mean_accuracy_ = getPredictors_multi(
    selected_classfier, lgbm_params, df_train, X_score, n_splits=5, seed_=randSeed
)

Mean Accuracy: 0.6984182776801406


In [51]:
# y_pred_score_multi holds one column of predicted class codes per CV fold.
# Cast to int and reduce each row to a single label with majorityPredictor
# (defined elsewhere in the notebook; presumably a majority vote - confirm).
y_score_multi_pred= y_pred_score_multi.astype(int).apply(majorityPredictor,axis=1) 
# Distribution of the combined LightGBM predictions over the class codes.
y_score_multi_pred.value_counts()

3    4395
2    3213
7    1305
5     243
1     195
6      77
8      25
0      12
dtype: int64

In [69]:
# Start the frame that collects every model's sub-class predictions side by side.
multi_class_df=pd.DataFrame()
# Numeric class code predicted by the LightGBM fold ensemble.
multi_class_df['class_pred_lgbm_subclass'] = y_score_multi_pred
# Translate the numeric code to the disorder name via class_decoding (defined elsewhere in the notebook).
multi_class_df['lgbm_class'] = multi_class_df['class_pred_lgbm_subclass'].apply(lambda x:class_decoding[x])
#multi_class_df['patient_id'] = df_valid.patient_id.values
multi_class_df.head()

Unnamed: 0,class_pred_lgbm_subclass,lgbm_class
0,2,Diabetes
1,7,Mitochondrial myopathy
2,7,Mitochondrial myopathy
3,3,Leigh syndrome
4,2,Diabetes


#### XGBoost Undersampling

In [59]:
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler,BorderlineSMOTE,SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import pickle
from collections import Counter

In [60]:
Counter(df_train['target'])

Counter({0: 648,
         1: 3448,
         2: 1817,
         3: 5160,
         4: 97,
         5: 2833,
         6: 1355,
         7: 4405,
         8: 152})

In [61]:
# Per-class target counts passed as `sampling_strategy` to RandomUnderSampler; one XGBoost model is trained per dict.
# sampling_str1: identical to the class counts in df_train['target'], i.e. no rows are dropped.
sampling_str1 = {0: 648,1: 3448,2: 1817,3: 5160,4: 97,5: 2833,6: 1355,7: 4405,8: 152}
# sampling_str2: every class cut down to 90 rows (fully balanced, below the rarest class size of 97).
sampling_str2 = {0: 90,1: 90,2: 90,3: 90,4: 90,5: 90,6: 90,7: 90,8: 90}
# sampling_str3: mild trimming of each class.
sampling_str3 = {0: 600,1: 3000,2: 1500,3: 3000,4: 90,5: 2500,6: 1300,7: 4000,8: 150}
# sampling_str4: roughly a tenth of each larger class; the smallest classes are kept at 50-100 rows.
sampling_str4 = {0: 100,1: 344,2: 181,3: 516,4: 50,5: 283,6: 134,7: 440,8: 50}

In [62]:
# Feature column names: every df_train column except the label column 'target'.
rfNms = df_train.columns.to_list()
rfNms.remove('target')

In [63]:
import xgboost as xgb
def xgb_train_on_sampling_one_fold(training,sampling,i,valid=None):
    """Train one XGBoost classifier on a resampled copy of ``training`` and pickle it.

    The data is first under-sampled per class according to ``sampling`` and
    then passed through ``RandomOverSampler`` with its default strategy. The
    fitted model is written to ``xgb_anomality_model1_{i}.pkl`` in the current
    working directory.

    Parameters
    ----------
    training : DataFrame containing the feature columns listed in the
        notebook-level ``rfNms`` and a ``'target'`` column.
    sampling : dict mapping class label -> number of rows to keep, forwarded
        to ``RandomUnderSampler(sampling_strategy=...)``.
    i : identifier used in the output file name.
    valid : unused; kept so existing callers keep working.

    Returns
    -------
    None
    """
    target_col = 'target'

    under_sampler = RandomUnderSampler(sampling_strategy=sampling, random_state=42)
    over_sampler = RandomOverSampler(random_state=42)
    resampler = Pipeline(steps=[('u', under_sampler), ('o', over_sampler)])

    X_resampled, y_resampled = resampler.fit_resample(training[rfNms], training[target_col])

    clf = xgb.XGBClassifier(
        objective='multi:softprob',
        learning_rate=0.02,
        subsample=0.35,
        colsample_bytree=0.7,
        min_child_weight=40,
        gamma=10,
        # n_estimators=2500,
        max_depth=3,
    )
    clf.fit(X_resampled, y_resampled)

    # Context manager guarantees the file is flushed and closed even if pickling fails.
    with open(f'xgb_anomality_model1_{i}.pkl', 'wb') as model_file:
        pickle.dump(clf, model_file)

In [64]:
# Train and pickle one XGBoost model per under-sampling strategy.
# The enumerate index becomes the file suffix (xgb_anomality_model1_0.pkl ... _3.pkl).
sampling_list = [sampling_str1,sampling_str2,sampling_str3,sampling_str4]
for i,d in enumerate(sampling_list):
    xgb_train_on_sampling_one_fold(df_train,sampling=d,i=i)



In [70]:
def xgb_evaluate(folds):
    """Score ``X_score`` with each pickled XGBoost model.

    Loads ``xgb_anomality_model1_0.pkl`` ... ``xgb_anomality_model1_{folds-1}.pkl``
    from the current working directory and calls ``predict_proba`` on
    ``X_score[rfNms]`` with each of them.

    Parameters
    ----------
    folds : number of pickled models to load (one per sampling strategy).

    Returns
    -------
    np.ndarray stacking the per-model probability matrices along axis 0.
    """
    preds = []

    for fold in range(folds):
        # NOTE(review): pickle.load can execute arbitrary code; only load files this notebook wrote itself.
        with open(f'xgb_anomality_model1_{fold}.pkl', 'rb') as model_file:
            model_xgb = pickle.load(model_file)
        preds.append(model_xgb.predict_proba(X_score[rfNms]))
    return np.asarray(preds)
xgb_preds = xgb_evaluate(4)  

In [71]:
from scipy.stats import hmean

# Combine the models: harmonic mean of the predicted probabilities across models (axis 0) ...
preds = hmean(xgb_preds,axis=0)
# ... then pick, for every row, the class with the highest combined score.
preds = preds.argmax(axis=1)
# Sanity check: one prediction per row of the scoring set.
len(preds)

9465

In [72]:
# Append the XGBoost ensemble's predictions to the existing multi_class_df
# (the frame is deliberately not re-created here, so the LightGBM columns are kept).
#multi_class_df=pd.DataFrame()
multi_class_df['xgb_us_pred'] = preds
# Translate the numeric code to the disorder name via class_decoding (defined elsewhere in the notebook).
multi_class_df['xgb_us_class'] = multi_class_df['xgb_us_pred'].apply(lambda x:class_decoding[x])
#multi_class_df.to_csv('multi_df_final.csv',index=False)

In [73]:
multi_class_df.head()

Unnamed: 0,class_pred_lgbm_subclass,lgbm_class,xgb_us_pred,xgb_us_class
0,2,Diabetes,8,Alzheimer's
1,7,Mitochondrial myopathy,7,Mitochondrial myopathy
2,7,Mitochondrial myopathy,7,Mitochondrial myopathy
3,3,Leigh syndrome,3,Leigh syndrome
4,2,Diabetes,2,Diabetes


##### Random Forest Model

In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler 

In [76]:
## Stratified 90/10 split of the labelled data into a training part and a hold-out sample
myTgt = 'target'
y1 = df_train[myTgt]
X_train_f, X_valid_f, y_train, y_valid = train_test_split(
    df_train[rfNms],
    y1,
    test_size=0.10,
    random_state=12345,
    stratify=y1,
)
print(X_train_f.shape)
print(X_valid_f.shape)

# Renumber the rows 0..n-1 so that positional and label-based indexing coincide downstream.
X_train_f = X_train_f.reset_index(drop=True)
X_valid_f = X_valid_f.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)

(17923, 77)
(1992, 77)


In [77]:
# Hyperopt search space for the random forest.
# Note: this rebinds the name `space` used earlier for the LightGBM search.
space = {
    # hp.choice: fmin reports the *index* of the chosen option, not the value itself.
    "n_estimators": hp.choice("n_estimators", [100, 250, 500,600,900,1000,1200]),
    #"n_estimators": hp.quniform("n_estimators", 50, 1500, 20), #number of trees, change it to 1000 for better results
    # hp.quniform samples whole numbers in [1, 15] but returns them as floats (e.g. 11.0).
    "max_depth": hp.quniform("max_depth", 1, 15,1),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    #"min_samples_split": hp.quniform("min_samples_split", 2, 200,2)
}

In [78]:
# Standardise the training features; the scaler is fitted on X_train_f only.
# X_scaled is the matrix used by the hyperparameter_tuning objective.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_f)

In [79]:
def hyperparameter_tuning(params):
    """Hyperopt objective for the random forest search.

    Builds a ``RandomForestClassifier`` from ``params``, cross-validates it on
    the notebook-level ``X_scaled`` / ``y_train`` with accuracy scoring and
    returns the negated mean accuracy in the dict format hyperopt expects.

    Parameters
    ----------
    params : dict of RandomForestClassifier keyword arguments sampled from the
        search space.

    Returns
    -------
    dict with keys ``"loss"`` (negated mean CV accuracy) and ``"status"``.
    """
    rf_params = dict(params)  # copy so the sampled dict is not mutated
    # hp.quniform yields floats (e.g. max_depth=11.0); scikit-learn expects
    # integer values for these parameters, so cast them.
    for int_param in ("n_estimators", "max_depth"):
        if rf_params.get(int_param) is not None:
            rf_params[int_param] = int(rf_params[int_param])

    clf = RandomForestClassifier(**rf_params,n_jobs=-1)
    acc = cross_val_score(clf, X_scaled, y_train,scoring="accuracy").mean()
    return {"loss": -acc, "status": STATUS_OK}

In [80]:
# Run the TPE search for the random forest; `trials` records every evaluation for later inspection.
from hyperopt import Trials,STATUS_OK,space_eval
trials = Trials()
best = fmin(
    fn=hyperparameter_tuning,
    space = space, 
    algo=tpe.suggest, 
    max_evals=10,
    trials=trials,
)

print("Best: {}".format(best))

# fmin reports hp.choice parameters as *indices* into their option lists
# (e.g. 'n_estimators': 6), so also decode them back to the actual values.
best_params = space_eval(space, best)
print("Best (decoded): {}".format(best_params))

100%|███████████████████████████████████████████████| 10/10 [04:24<00:00, 26.43s/trial, best loss: -0.6869946702530385]
Best: {'criterion': 1, 'max_depth': 11.0, 'n_estimators': 6}


In [81]:
best

{'criterion': 1, 'max_depth': 11.0, 'n_estimators': 6}

In [82]:
%%time 
pred____ = pd.DataFrame()  # per-fold predictions on the scoring set (X_score)
pred = pd.DataFrame()      # per-fold predictions on the hold-out sample (X_valid_f)

seed=42
# Tuned parameters. max_features='sqrt' is what the deprecated 'auto' alias meant for
# classifiers; 'auto' is no longer accepted by recent scikit-learn releases.
clf = RandomForestClassifier(n_estimators=1200, max_features='sqrt', max_depth=11,
                             class_weight='balanced', criterion='entropy', random_state=seed)
# 5 stratified, shuffled folds: one model is fitted per fold and the per-fold
# predictions are combined afterwards.
kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=seed)

cv_score_rf_b =[]
cv_score_rf_f = []

for i, (train_index, test_index) in enumerate(kf.split(X_train_f, y_train.values), start=1):
    print('{} of KFold {}'.format(i,kf.n_splits))
    # kf.split yields positional indices, so select rows with iloc.
    xtr,xvl = X_train_f.iloc[train_index],X_train_f.iloc[test_index]
    ytr,yvl = y_train.iloc[train_index],y_train.iloc[test_index]
    # (xvl / yvl are not used for scoring in this loop; evaluation is on the hold-out sample.)

    #model
    clf.fit(xtr,ytr)

    pred[i] = clf.predict(X_valid_f)

    balanced_acc = balanced_accuracy_score(y_valid,pred[i])
    print('Balanced Accuracy on validation set:', balanced_acc)
    cv_score_rf_b.append(balanced_acc)

    macro_f1 = f1_score(y_valid,pred[i],average='macro')
    print('F1 Score on validation set:',macro_f1)
    cv_score_rf_f.append(macro_f1)

    pred____[i] = clf.predict(X_score[rfNms])  #prediction on the eval dataset
    

1 of KFold 5
Balanced Accuracy on validation set: 0.6660966038449277
F1 Score on validation set: 0.6913502780022123
2 of KFold 5
Balanced Accuracy on validation set: 0.6400881129194671
F1 Score on validation set: 0.670702542372378
3 of KFold 5
Balanced Accuracy on validation set: 0.6339737586400909
F1 Score on validation set: 0.6585593054825422
4 of KFold 5
Balanced Accuracy on validation set: 0.6426916008726917
F1 Score on validation set: 0.6740068843566619
5 of KFold 5
Balanced Accuracy on validation set: 0.6551540811748433
F1 Score on validation set: 0.6825958523403571
Wall time: 3min 24s


In [83]:
# Majority vote across the per-fold hold-out predictions. Any existing 'Final_rf'
# column is dropped first so that re-running this cell does not let the previous
# vote count as an extra voter.
pred['Final_rf'] = pred.drop(columns='Final_rf', errors='ignore').mode(axis=1).iloc[:, 0]

In [84]:
# Print the classification report of the majority-voted RF predictions on the hold-out sample
from sklearn.metrics import classification_report
print(classification_report(y_valid,pred['Final_rf']))

              precision    recall  f1-score   support

           0       0.90      0.69      0.78        65
           1       0.89      0.94      0.91       345
           2       0.92      1.00      0.96       182
           3       0.64      0.64      0.64       516
           4       1.00      0.70      0.82        10
           5       0.68      0.67      0.68       283
           6       0.51      0.44      0.47       135
           7       0.60      0.61      0.61       441
           8       1.00      0.13      0.24        15

    accuracy                           0.71      1992
   macro avg       0.79      0.65      0.68      1992
weighted avg       0.71      0.71      0.70      1992



In [85]:
# Majority vote across the per-fold predictions on the scoring set. Any existing
# 'Final_rf' column is dropped first so that re-running this cell does not let the
# previous vote count as an extra voter.
pred____['Final_rf'] = pred____.drop(columns='Final_rf', errors='ignore').mode(axis=1).iloc[:, 0]
pred____.head()

Unnamed: 0,1,2,3,4,5,Final_rf
0,2,2,2,2,2,2
1,7,7,7,7,7,7
2,7,7,7,7,7,7
3,3,3,3,3,3,3
4,2,2,2,2,2,2


In [86]:
# multi_class_df is not re-created here: it is extended in place, and the
# frame displayed further down already carries LightGBM and XGBoost columns
# next to the random-forest ones added by this cell.
multi_class_df['rf_us_pred'] = pred____['Final_rf']

# Translate each encoded prediction through class_decoding.
multi_class_df['rf_us_class'] = (
    multi_class_df['rf_us_pred']
    .apply(lambda encoded_label: class_decoding[encoded_label])
)

# Persist the per-model predictions (index column omitted).
multi_class_df.to_csv('multi_df_final_sub_class.csv', index=False)

In [87]:
# Distribution of the decoded random-forest predictions.
multi_class_df['rf_us_class'].value_counts()

Leigh syndrome                         4070
Diabetes                               3219
Mitochondrial myopathy                 1627
Tay-Sachs                               195
Cystic fibrosis                         195
Hemochromatosis                         125
Alzheimer's                              19
Leber's hereditary optic neuropathy      15
Name: rf_us_class, dtype: int64

In [88]:
# Distribution of the values in the xgb_us_class column, for comparison.
multi_class_df['xgb_us_class'].value_counts()

Mitochondrial myopathy                 3082
Alzheimer's                            1697
Leigh syndrome                         1547
Diabetes                               1541
Leber's hereditary optic neuropathy    1083
Hemochromatosis                         301
Cystic fibrosis                         195
Tay-Sachs                                19
Name: xgb_us_class, dtype: int64

In [89]:
# Distribution of the values in the lgbm_class column, for comparison.
multi_class_df['lgbm_class'].value_counts()

Leigh syndrome                         4395
Diabetes                               3213
Mitochondrial myopathy                 1305
Tay-Sachs                               243
Cystic fibrosis                         195
Hemochromatosis                          77
Alzheimer's                              25
Leber's hereditary optic neuropathy      12
Name: lgbm_class, dtype: int64

In [90]:
# Preview the per-model encoded and decoded predictions side by side.
multi_class_df.head(n=5)

Unnamed: 0,class_pred_lgbm_subclass,lgbm_class,xgb_us_pred,xgb_us_class,rf_us_pred,rf_us_class
0,2,Diabetes,8,Alzheimer's,2,Diabetes
1,7,Mitochondrial myopathy,7,Mitochondrial myopathy,7,Mitochondrial myopathy
2,7,Mitochondrial myopathy,7,Mitochondrial myopathy,7,Mitochondrial myopathy
3,3,Leigh syndrome,3,Leigh syndrome,3,Leigh syndrome
4,2,Diabetes,2,Diabetes,2,Diabetes


In [91]:
# Final sub-class label per row: majorityPredictor (defined elsewhere in the
# notebook) is applied row-wise to the three decoded class columns.
multi_class_df['pred_multi_sub_class'] = (
    multi_class_df
    .loc[:, ['xgb_us_class', 'lgbm_class', 'rf_us_class']]
    .apply(majorityPredictor, axis=1)
)

In [93]:
# Distribution of the combined (three-model) sub-class prediction.
multi_class_df['pred_multi_sub_class'].value_counts()

Leigh syndrome                         3960
Diabetes                               3201
Mitochondrial myopathy                 1698
Cystic fibrosis                         195
Tay-Sachs                               182
Hemochromatosis                         138
Leber's hereditary optic neuropathy      54
Alzheimer's                              37
Name: pred_multi_sub_class, dtype: int64

In [94]:
# Load the phase-1 predictions file into its own frame; the sub-class
# prediction is attached to it in a later cell.
genetic_disorder_pred = pd.read_csv(filepath_or_buffer="pred_genetic_disorder_phase1.csv")

In [96]:
# Attach the combined sub-class prediction to the phase-1 frame.
#
# Column assignment from a Series aligns on index labels, and the columns
# shown for multi_class_df include no patient_id to join on. If the two
# indexes ever differ (different length, filtered rows, non-reset index), the
# assignment would silently write NaN or misassigned sub-classes into the
# submission. Fail loudly here instead of exporting a misaligned file.
assert genetic_disorder_pred.index.equals(multi_class_df.index), (
    "genetic_disorder_pred and multi_class_df are not row-aligned; "
    "refusing to attach pred_multi_sub_class"
)
genetic_disorder_pred['pred_multi_sub_class'] = multi_class_df['pred_multi_sub_class']

In [97]:
# Preview the phase-1 frame with the sub-class prediction attached.
genetic_disorder_pred.head(n=5)

Unnamed: 0,class_pred_lgbm,patient_id,xgb_us_pred,xgb_us_class,lgbm_class,rf_us_pred,rf_us_class,pred_multi,pred_multi_sub_class
0,1,PID0x4175,2,Multifactorial genetic inheritance disorders,Single-gene inheritance diseases,0.0,Mitochondrial genetic inheritance disorders,Multifactorial genetic inheritance disorders,Diabetes
1,0,PID0x21f5,0,Mitochondrial genetic inheritance disorders,Mitochondrial genetic inheritance disorders,0.0,Mitochondrial genetic inheritance disorders,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy
2,0,PID0x49b8,0,Mitochondrial genetic inheritance disorders,Mitochondrial genetic inheritance disorders,0.0,Mitochondrial genetic inheritance disorders,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy
3,0,PID0x2d97,0,Mitochondrial genetic inheritance disorders,Mitochondrial genetic inheritance disorders,0.0,Mitochondrial genetic inheritance disorders,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,1,PID0x58da,2,Multifactorial genetic inheritance disorders,Single-gene inheritance diseases,2.0,Multifactorial genetic inheritance disorders,Multifactorial genetic inheritance disorders,Diabetes


In [98]:
# Write the final submission: patient id plus the two prediction columns
# (pred_multi and pred_multi_sub_class), without the index column.
(
    genetic_disorder_pred
    .loc[:, ['patient_id', 'pred_multi', 'pred_multi_sub_class']]
    .to_csv('pred_genetic_disorder_phase2_final_submission.csv', index=False)
)