In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# !pip install xgboost
# !pip install lightgbm
# !pip install imblearn
# !pip install borutashap
plt.rcParams["figure.figsize"] = (20,8)

In [2]:
df1=pd.read_csv('Survival_time_event.csv', index_col=0)
# df2=pd.read_csv('Treatment.csv', index_col=0)
df3=pd.read_csv('Clinical_Variables.csv', index_col=0)
df4=pd.read_csv('Genetic_alterations.csv', index_col=0)
df5=pd.read_csv('newLabel.csv', index_col=0)

In [3]:
'''
Correlating numerical features of Time data

- outlier value drop
'''

print('outlier of time: ')
print(df1.loc[df1['time'] < 0, 'time'], end='\n\n')

df1_outlier = df1.copy()
df1_outlier.loc[df1_outlier['time'] < 0, 'time'] = abs(df1_outlier.loc[df1_outlier['time'] < 0, 'time'])
print(df1_outlier.describe(), end='\n\n')

outlier of time: 
905   -7.945621
Name: time, dtype: float64

              time        event
count  1000.000000  1000.000000
mean     51.876125     0.891000
std      22.122689     0.311795
min       7.070708     0.000000
25%      37.401307     1.000000
50%      47.064712     1.000000
75%      60.966476     1.000000
max     217.078908     1.000000



In [4]:
'''
Correlating numerical features of Clinic data

- outlier value drop
'''

df3_outlier = df3.copy()


# drop outlier
for col in df3_outlier.columns:
    for outlier in range(10,13):
        df3_outlier = df3_outlier.replace(outlier, 9)

# visualize
for col in df3_outlier.columns:
    print('#', col)
    print(df3_outlier[col].value_counts())
    print('-'*20)

# Var1
2    235
3    204
1    171
4    139
5     95
0     57
6     50
7     27
8     13
9      9
Name: Var1, dtype: int64
--------------------
# Var2
3    221
2    218
4    163
1    113
5    109
6     65
0     48
7     30
9     20
8     13
Name: Var2, dtype: int64
--------------------
# Var3
2    260
3    196
1    156
4    130
5     97
0     55
6     55
7     23
8     16
9     12
Name: Var3, dtype: int64
--------------------
# Var4
2    242
3    195
1    150
4    140
5    106
6     67
0     36
7     32
8     16
9     16
Name: Var4, dtype: int64
--------------------
# Var5
2    247
3    223
4    161
5    124
1     76
6     63
7     41
0     28
9     19
8     18
Name: Var5, dtype: int64
--------------------
# Var6
2    240
3    212
4    128
1    127
5     99
6     64
0     53
7     40
8     20
9     17
Name: Var6, dtype: int64
--------------------
# Var7
1    269
2    208
3    144
0    128
4    118
5     62
6     47
7     16
8      6
9      2
Name: Var7, dtype: int64
--------------------

In [5]:
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score

In [6]:
'''
Dataset of Best Accuracy


'''

df = pd.concat([df5, df1_outlier, df3_outlier ,df4], axis=1)
df = df.drop(['event'], axis=1)

In [7]:
'''
Model list


'''

ensemble_models = [
    ('lrcv', LogisticRegression(max_iter = 10000)),
    ('ada', AdaBoostClassifier()),
    ('bc', BaggingClassifier()),
    ('etc',ExtraTreesClassifier()),
    ('gbc', GradientBoostingClassifier()),
    ('rfc', RandomForestClassifier(n_estimators=20)),
    ('knn', KNeighborsClassifier(n_neighbors = 4)),
    ('svc', SVC(probability=True)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ('lgbm', LGBMClassifier()),
    ('dtc', DecisionTreeClassifier()),
    ('gnb',GaussianNB()),
]

models = [VotingClassifier(ensemble_models, voting='soft'),
          LogisticRegression(max_iter = 10000), SVC(), KNeighborsClassifier(n_neighbors = 4), 
          GaussianNB(), Perceptron(),
          SGDClassifier(), 
          DecisionTreeClassifier(), RandomForestClassifier(n_estimators=20)]
#LinearSVC(max_iter=1000000)

In [8]:
'''
Select Model of Best Accuracy


'''
from sklearn.model_selection import StratifiedKFold


def training(model_list):
    best_model = []
    for model in model_list:
        model_name = str(model)[:str(model).find('(')]
        print('Model: ', model_name)
        print()
        features = df.drop(['newlabel'], axis=1)
        labels = df['newlabel']
        
        splits = [5, 10, 7]
        
        for s in splits:
            skfold = StratifiedKFold(n_splits=s)
            idx_iter=0
            cv_accuracy=[]
            cv_precision=[]
            cv_recall=[]
            cv_f1score=[]

            for i in range(20):
                features = features.sample(frac=1).reset_index(drop=True)
                labels = labels.sample(frac=1).reset_index(drop=True)

                for train_index, test_index in skfold.split(features,labels):
                    np.random.shuffle(train_index)
                    np.random.shuffle(test_index)

                    # split train and test set
                    X_train, X_test = features.iloc[train_index,:], features.iloc[test_index,:]
                    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

                    # train ans prediction
                    model.fit(X_train, y_train)
                    pred = model.predict(X_test)

                    idx_iter += 1

                    # 
                    accuracy = np.round(accuracy_score(y_test, pred), 4)
                    cv_accuracy.append(accuracy)

                    precision = np.round(precision_score(y_test, pred, average='weighted', zero_division=0), 4)
                    cv_precision.append(precision)

                    recall = np.round(recall_score(y_test, pred, average='weighted', zero_division=0), 4)
                    cv_recall.append(recall)

                    f1score = np.round(f1_score(y_test, pred, average='weighted', zero_division=0), 4)
                    cv_f1score.append(f1score)

                    #train_size = X_train.shape[0]
                    #test_size = X_test.shape[0]

                    #print('\n#{0} 교차 검증 정확도: {1}, 학습 데이터 크기: {2}, 검증 데이터 크기: {3}'.format(idx_iter, accuracy, train_size, test_size))
                    #print('#{0} 검증 세트 인덱스: MIN{1}, MAX{2}'.format(idx_iter, min(test_index), max(test_index)))

                    #print('학습 레이블 데이터 분포: \n', pd.Series(y_train).value_counts())
                    #print('검증 레이블 데이터 분포: \n', pd.Series(y_test).value_counts())

            print('## 교차 검증 총 횟수: ', len(cv_accuracy), '(분할개수:', s, ')')
            print('## 교차 검증별 정확도: ', np.round(cv_accuracy, 4))
            print('## 평균 검증 정확도: ', np.round(np.mean(cv_accuracy), 5))
            print('## 평균 검증 F1 Score: ', np.round(np.mean(cv_f1score), 5))
            print('##')
            #save model name, split num, acc, pre, rec, f1
            best_model.append([model_name, s, 
                               np.round(np.mean(cv_accuracy), 5), 
                               np.round(np.mean(cv_precision), 5),
                               np.round(np.mean(cv_recall), 5),
                               np.round(np.mean(cv_f1score), 5)])
        print()
        print('-'*100)
        print()

In [9]:
training(models)

Model:  VotingClassifier

## 교차 검증 총 횟수:  100 (분할개수: 5 )
## 교차 검증별 정확도:  [0.48  0.45  0.465 0.445 0.4   0.435 0.435 0.41  0.49  0.465 0.47  0.43
 0.465 0.435 0.46  0.41  0.495 0.375 0.445 0.45  0.415 0.405 0.385 0.425
 0.435 0.47  0.475 0.46  0.44  0.405 0.445 0.485 0.45  0.425 0.43  0.405
 0.49  0.475 0.45  0.485 0.39  0.445 0.46  0.37  0.4   0.48  0.475 0.455
 0.355 0.41  0.41  0.45  0.43  0.41  0.425 0.43  0.46  0.455 0.415 0.44
 0.42  0.415 0.445 0.5   0.42  0.455 0.415 0.4   0.43  0.51  0.48  0.48
 0.485 0.44  0.44  0.405 0.49  0.485 0.445 0.505 0.42  0.425 0.47  0.4
 0.435 0.445 0.425 0.435 0.39  0.445 0.455 0.445 0.48  0.415 0.425 0.45
 0.45  0.455 0.455 0.51 ]
## 평균 검증 정확도:  0.44225
## 평균 검증 F1 Score:  0.41641
##
## 교차 검증 총 횟수:  200 (분할개수: 10 )
## 교차 검증별 정확도:  [0.36 0.48 0.39 0.36 0.45 0.4  0.41 0.47 0.43 0.46 0.43 0.44 0.38 0.5
 0.5  0.51 0.55 0.41 0.5  0.45 0.46 0.44 0.49 0.4  0.43 0.44 0.5  0.48
 0.4  0.49 0.48 0.38 0.49 0.4  0.47 0.46 0.38 0.43 0.42 0.48 0.46 0.47
 0.41 0.5

## 교차 검증 총 횟수:  100 (분할개수: 5 )
## 교차 검증별 정확도:  [0.465 0.435 0.45  0.455 0.41  0.415 0.435 0.465 0.475 0.46  0.37  0.42
 0.475 0.425 0.395 0.425 0.405 0.4   0.425 0.4   0.46  0.425 0.43  0.425
 0.43  0.43  0.455 0.475 0.41  0.43  0.415 0.425 0.49  0.51  0.425 0.47
 0.435 0.44  0.46  0.415 0.435 0.43  0.39  0.47  0.455 0.495 0.415 0.44
 0.455 0.44  0.435 0.455 0.39  0.425 0.46  0.385 0.43  0.43  0.4   0.4
 0.47  0.405 0.4   0.425 0.415 0.4   0.41  0.42  0.405 0.42  0.485 0.375
 0.415 0.425 0.48  0.455 0.46  0.415 0.4   0.425 0.465 0.4   0.415 0.485
 0.42  0.485 0.405 0.46  0.455 0.45  0.48  0.45  0.475 0.47  0.415 0.44
 0.475 0.445 0.48  0.415]
## 평균 검증 정확도:  0.43585
## 평균 검증 F1 Score:  0.41392
##
## 교차 검증 총 횟수:  200 (분할개수: 10 )
## 교차 검증별 정확도:  [0.42 0.42 0.44 0.49 0.38 0.4  0.34 0.46 0.37 0.42 0.49 0.38 0.38 0.39
 0.42 0.39 0.4  0.44 0.43 0.44 0.47 0.41 0.46 0.43 0.52 0.39 0.44 0.38
 0.41 0.42 0.4  0.41 0.42 0.46 0.41 0.49 0.35 0.38 0.4  0.49 0.48 0.45
 0.41 0.41 0.4  0.47 0.4  0.45 0.3

## 교차 검증 총 횟수:  100 (분할개수: 5 )
## 교차 검증별 정확도:  [0.07  0.435 0.41  0.44  0.43  0.44  0.23  0.43  0.445 0.37  0.45  0.47
 0.44  0.09  0.5   0.42  0.41  0.455 0.43  0.435 0.47  0.43  0.46  0.435
 0.42  0.47  0.47  0.245 0.445 0.46  0.44  0.44  0.465 0.52  0.385 0.44
 0.43  0.45  0.185 0.135 0.47  0.38  0.445 0.39  0.4   0.495 0.41  0.44
 0.385 0.44  0.245 0.43  0.45  0.45  0.455 0.405 0.46  0.455 0.43  0.42
 0.445 0.445 0.41  0.495 0.455 0.37  0.435 0.47  0.48  0.455 0.065 0.475
 0.12  0.445 0.45  0.445 0.455 0.195 0.43  0.475 0.47  0.45  0.365 0.43
 0.43  0.07  0.465 0.415 0.455 0.44  0.145 0.405 0.46  0.44  0.46  0.305
 0.45  0.44  0.06  0.36 ]
## 평균 검증 정확도:  0.3998
## 평균 검증 F1 Score:  0.32101
##
## 교차 검증 총 횟수:  200 (분할개수: 10 )
## 교차 검증별 정확도:  [0.52 0.45 0.42 0.34 0.36 0.4  0.44 0.33 0.42 0.42 0.45 0.43 0.36 0.47
 0.36 0.47 0.42 0.38 0.47 0.42 0.31 0.41 0.33 0.46 0.41 0.44 0.38 0.45
 0.28 0.37 0.47 0.46 0.08 0.39 0.45 0.41 0.46 0.19 0.34 0.47 0.45 0.43
 0.44 0.39 0.25 0.47 0.43 0.39 0.2

In [10]:
np_best_model = np.array(best_model)
max_model = np.argmax(np_best_model, axis=0)
print(best_model[max_model,0])

NameError: name 'best_model' is not defined

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

def modeling(model):
    skfold = StratifiedKFold(n_splits=10)
    cv_accuracy=[]
    cv_f1score=[]

    for i in range(20):
        features = features.sample(frac=1).reset_index(drop=True)
        labels = labels.sample(frac=1).reset_index(drop=True)

        for train_index, test_index in skfold.split(features,label):
            np.random.shuffle(train_index)
            np.random.shuffle(test_index)

            # split train and test set
            X_train, X_test = features.iloc[train_index,:], features.iloc[test_index,:]
            y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

            # train ans prediction
            model.fit(X_train, y_train)
            model.predict(X_test)
            
            accuracy = np.round(accuracy_score(y_test, pred), 4)
            cv_accuracy.append(accuracy)

            f1score = np.round(f1_score(y_test, pred, average='weighted', zero_division=0), 4)
            cv_f1score.append(f1score)

    print('## 교차 검증 총 횟수: ', len(cv_accuracy))
    print('## 평균 검증 정확도: ', np.round(np.mean(cv_accuracy), 5))
    print('## 평균 검증 F1 Score: ', np.round(np.mean(cv_f1score), 5))

modeling(best_model)

In [None]:
'''
Feature selection


'''

from BorutaShap import BorutaShap
import shap
from eli5.lightgbm import *
from eli5.sklearn import *
import eli5

In [None]:
'''
Feature selection Method 1

BorutaShap

'''

Feature_Selector = BorutaShap(model=best_model, importance_measure='shap', classification=True)
Feature_Selector.fit(X=X_train, y=Y_train, n_trials=50, random_state=0)
Feature_Selector.plot(X_size=12, which_features='all', figsize=(48,24))

In [None]:
'''
Feature selection Method 2

TreeExplainer

'''

shap.initjs()
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
fig = shap.summary_plot(shap_values, X_test)

In [None]:
'''
Feature selection Method 3

PermutationImportance

'''

perm = PermutationImportance(best_model, random_state=42).fit(X_test, y_test)
#print(eli5.format_as_text(explain_weights.explain_permutation_importance(perm, feature_names = X_test.columns.values, top=40)))
explain_weights.explain_permutation_importance(perm, feature_names = X_test.columns.values, top=50)