In [106]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# !pip install xgboost
# !pip install lightgbm
# !pip install imblearn
# !pip install borutashap
# !pip install eli5

In [107]:
# Load the four input tables. In each file the first CSV column is used as the
# DataFrame index (index_col=0).
# Survival table: later cells read its 'time' and 'event' columns.
time_table = pd.read_csv('Survival_time_event.csv', index_col=0)
# Clinical variables: later cells iterate over all of its columns.
clinic_table = pd.read_csv('Clinical_Variables.csv', index_col=0)
# Genetic alterations: later cells select columns from it by gene name (e.g. 'G211').
genetic_table = pd.read_csv('Genetic_alterations.csv', index_col=0)
# NOTE(review): presumably supplies the 'newlabel' target column used by TestML — confirm.
survival_treatment_table = pd.read_csv('newLabel.csv', index_col=0)

In [108]:
# Time data: report and repair negative survival times.
#
# Rows whose `time` is negative are printed first. They are not dropped:
# a copy of the table is made and the negative values are replaced by their
# absolute value, leaving `time_table` itself unchanged.

negative_time = time_table['time'] < 0

print('outlier of time: ')
print(time_table.loc[negative_time, 'time'], end='\n\n')

time_table_outlier = time_table.copy()
time_table_outlier.loc[negative_time, 'time'] = time_table_outlier.loc[negative_time, 'time'].abs()

# Summary statistics of the repaired copy.
print(time_table_outlier.describe(), end='\n\n')

outlier of time: 
905   -7.945621
Name: time, dtype: float64

              time        event
count  1000.000000  1000.000000
mean     51.876125     0.891000
std      22.122689     0.311795
min       7.070708     0.000000
25%      37.401307     1.000000
50%      47.064712     1.000000
75%      60.966476     1.000000
max     217.078908     1.000000



In [109]:
# Clinic data: recode out-of-range codes.
#
# The values 10, 11 and 12 are replaced by 9 in every column, so they are
# merged into the 9 category rather than dropped. `replace` returns a new
# frame, so `clinic_table` itself is left unchanged. A single call covers all
# columns; no per-column loop is needed because the replacement is applied to
# the whole frame.
clinic_table_outlier = clinic_table.replace([10, 11, 12], 9)

# Print the value distribution of every column after recoding.
for col in clinic_table_outlier.columns:
    print('#', col)
    print(clinic_table_outlier[col].value_counts())
    print('-'*20)

# Var1
2    235
3    204
1    171
4    139
5     95
0     57
6     50
7     27
8     13
9      9
Name: Var1, dtype: int64
--------------------
# Var2
3    221
2    218
4    163
1    113
5    109
6     65
0     48
7     30
9     20
8     13
Name: Var2, dtype: int64
--------------------
# Var3
2    260
3    196
1    156
4    130
5     97
0     55
6     55
7     23
8     16
9     12
Name: Var3, dtype: int64
--------------------
# Var4
2    242
3    195
1    150
4    140
5    106
6     67
0     36
7     32
8     16
9     16
Name: Var4, dtype: int64
--------------------
# Var5
2    247
3    223
4    161
5    124
1     76
6     63
7     41
0     28
9     19
8     18
Name: Var5, dtype: int64
--------------------
# Var6
2    240
3    212
4    128
1    127
5     99
6     64
0     53
7     40
8     20
9     17
Name: Var6, dtype: int64
--------------------
# Var7
1    269
2    208
3    144
0    128
4    118
5     62
6     47
7     16
8      6
9      2
Name: Var7, dtype: int64
--------------------

In [110]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
import lightgbm as lgb

In [143]:
def TestML(model, selected_genetic, train_size=800):
    """Fit ``model`` on the leading rows of the combined table and report accuracy.

    The feature table is built by concatenating, column-wise, the notebook-level
    frames ``survival_treatment_table``, ``time_table_outlier``,
    ``clinic_table_outlier`` and the ``selected_genetic`` columns of
    ``genetic_table``. The ``event`` column is removed and ``newlabel`` is used
    as the target. The split is positional and unshuffled: the first
    ``train_size`` rows are used for training and all remaining rows for testing.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
        It is (re)fitted in place on every call.
    selected_genetic : list of str
        Column names to take from ``genetic_table``.
    train_size : int, default 800
        Number of leading rows used for training.

    Returns
    -------
    tuple
        ``(train_acc, test_acc)``. NOTE: the two values use different scales.
        ``train_acc`` is a percentage (``score * 100`` rounded to 2 decimals)
        while ``test_acc`` is a fraction rounded to 5 decimals. The scales are
        kept as they are because other cells compare the results against
        thresholds such as 93.25 and 0.635.

    Raises
    ------
    ValueError
        If ``train_size`` does not leave at least one row on each side of the split.
    """
    genetic_subset = genetic_table[selected_genetic]

    input_dataset = pd.concat(
        [survival_treatment_table, time_table_outlier, clinic_table_outlier, genetic_subset],
        axis=1,
    ).drop(columns=['event'])

    n_rows = len(input_dataset)
    if not 0 < train_size < n_rows:
        raise ValueError(
            f'train_size must be between 1 and {n_rows - 1}, got {train_size}'
        )

    # Positional split: slicing avoids assuming a fixed number of rows.
    train_data = input_dataset.iloc[:train_size]
    test_data = input_dataset.iloc[train_size:]

    X_train = train_data.drop(columns=['newlabel'])
    Y_train = train_data['newlabel']

    X_test = test_data.drop(columns=['newlabel'])
    Y_test = test_data['newlabel']

    model.fit(X_train, Y_train)

    # Train accuracy is reported as a percentage (see the Returns note).
    train_acc = np.round(round(model.score(X_train, Y_train) * 100, 2), 5)
    print('## Train 정확도: ', train_acc)

    # Test accuracy is reported as a fraction in [0, 1].
    Y_pred = model.predict(X_test)
    test_acc = np.round(accuracy_score(Y_test, Y_pred), 5)
    print('## Test 정확도: ', test_acc)
    print()

    return train_acc, test_acc

In [160]:
# Reference run: evaluate two classifiers on one fixed set of ten gene columns.
selected_genetic = [
    'G211', 'G264', 'G179', 'G27', 'G147',
    'G139', 'G242', 'G80', 'G263', 'G290',
]

# Random forest with 200 trees.
test_model1 = RandomForestClassifier(n_estimators=200)
TestML(test_model1, selected_genetic)

# LightGBM classifier; the call is kept as the cell's last expression so the
# returned (train, test) tuple is displayed as the cell output.
test_model2 = lgb.LGBMClassifier(
    n_estimators=30,
    num_leaves=64,
    n_jobs=-1,
    boost_from_average=False,
)
TestML(test_model2, selected_genetic)

## Train 정확도:  100.0
## Test 정확도:  0.615

## Train 정확도:  93.25
## Test 정확도:  0.635



(93.25, 0.635)

In [161]:
# Candidate gene column names 'G1' through 'G300'.
genetic = [f'G{gene_no}' for gene_no in range(1, 301)]

In [162]:
import random

# Preview one random draw of ten distinct genes. `random.sample` draws without
# replacement, so no gene name can appear twice in the printed list
# (`random.choices` draws with replacement and can repeat a gene).
print(random.sample(genetic, 10))

['G37', 'G285', 'G72', 'G132', 'G137', 'G102', 'G111', 'G18', 'G139', 'G128']


In [166]:
# Random search with a random forest: draw 100 sets of ten distinct genes,
# evaluate each with TestML, and keep the sets whose test accuracy is
# strictly above 0.615.
over_genetic = []

test_model1 = RandomForestClassifier(n_estimators=200)

for i in range(100):
    test_genetic = random.sample(genetic, 10)
    print('## 임의의 10개 유전자: ', test_genetic)
    train_accuracy, test_accuracy = TestML(test_model1, test_genetic)

    # Skip gene sets that do not beat the test-accuracy threshold.
    if not test_accuracy > 0.615:
        continue

    print('-----------------Over!-----------------', end='\n\n\n')
    over_genetic.append(test_genetic)


## 임의의 10개 유전자:  ['G3', 'G88', 'G87', 'G78', 'G123', 'G143', 'G152', 'G88', 'G143', 'G280']
## Train 정확도:  100.0
## Test 정확도:  0.57

## 임의의 10개 유전자:  ['G175', 'G224', 'G126', 'G19', 'G220', 'G70', 'G92', 'G35', 'G241', 'G114']
## Train 정확도:  100.0
## Test 정확도:  0.57

## 임의의 10개 유전자:  ['G211', 'G55', 'G285', 'G10', 'G150', 'G236', 'G286', 'G192', 'G258', 'G160']
## Train 정확도:  100.0
## Test 정확도:  0.58

## 임의의 10개 유전자:  ['G145', 'G123', 'G69', 'G281', 'G258', 'G246', 'G252', 'G115', 'G27', 'G102']
## Train 정확도:  100.0
## Test 정확도:  0.595

## 임의의 10개 유전자:  ['G65', 'G161', 'G188', 'G240', 'G3', 'G137', 'G116', 'G38', 'G154', 'G196']
## Train 정확도:  100.0
## Test 정확도:  0.55

## 임의의 10개 유전자:  ['G22', 'G110', 'G14', 'G200', 'G248', 'G161', 'G212', 'G26', 'G262', 'G57']
## Train 정확도:  100.0
## Test 정확도:  0.59

## 임의의 10개 유전자:  ['G149', 'G97', 'G103', 'G77', 'G283', 'G79', 'G43', 'G287', 'G288', 'G254']
## Train 정확도:  100.0
## Test 정확도:  0.57

## 임의의 10개 유전자:  ['G180', 'G133', 'G77', 'G231', 'G5

## Train 정확도:  100.0
## Test 정확도:  0.59

## 임의의 10개 유전자:  ['G222', 'G94', 'G241', 'G255', 'G118', 'G1', 'G86', 'G184', 'G57', 'G270']
## Train 정확도:  100.0
## Test 정확도:  0.555

## 임의의 10개 유전자:  ['G138', 'G102', 'G153', 'G280', 'G38', 'G210', 'G299', 'G36', 'G35', 'G139']
## Train 정확도:  100.0
## Test 정확도:  0.565

## 임의의 10개 유전자:  ['G187', 'G34', 'G109', 'G52', 'G235', 'G83', 'G49', 'G114', 'G92', 'G134']
## Train 정확도:  100.0
## Test 정확도:  0.55

## 임의의 10개 유전자:  ['G235', 'G3', 'G167', 'G90', 'G11', 'G63', 'G268', 'G100', 'G231', 'G129']
## Train 정확도:  100.0
## Test 정확도:  0.585

## 임의의 10개 유전자:  ['G108', 'G285', 'G222', 'G152', 'G89', 'G142', 'G289', 'G216', 'G85', 'G93']
## Train 정확도:  100.0
## Test 정확도:  0.595

## 임의의 10개 유전자:  ['G249', 'G138', 'G34', 'G10', 'G105', 'G28', 'G6', 'G203', 'G210', 'G201']
## Train 정확도:  100.0
## Test 정확도:  0.535

## 임의의 10개 유전자:  ['G140', 'G129', 'G81', 'G211', 'G16', 'G68', 'G231', 'G297', 'G200', 'G170']
## Train 정확도:  100.0
## Test 정확도:  0.585

## 임의의 10

In [167]:
# Show the gene sets currently stored in `over_genetic`.
print(over_genetic)

[]


In [175]:
# Random search with LightGBM: draw 100 sets of ten distinct genes, evaluate
# each with TestML, and record every set that is strictly above 93.25 on train
# accuracy (percent) and/or strictly above 0.635 on test accuracy (fraction),
# together with a label saying which threshold(s) it exceeded.
over_genetic = []

test_model2 = lgb.LGBMClassifier(
    n_estimators=30,
    num_leaves=64,
    n_jobs=-1,
    boost_from_average=False,
)

for i in range(100):
    test_genetic = random.sample(genetic, 10)
    print('## 임의의 10개 유전자: ', test_genetic)
    train_accuracy, test_accuracy = TestML(test_model2, test_genetic)

    beats_train = train_accuracy > 93.25
    beats_test = test_accuracy > 0.635

    # Nothing to record when neither threshold is exceeded.
    if not (beats_train or beats_test):
        continue

    print('-----------------Over!-----------------', end='\n\n\n')
    if beats_train and beats_test:
        whatover = 'train&test'
    else:
        whatover = 'train' if beats_train else 'test'
    over_genetic.append([test_genetic, whatover])

## 임의의 10개 유전자:  ['G51', 'G24', 'G30', 'G235', 'G90', 'G294', 'G180', 'G113', 'G215', 'G28']
## Train 정확도:  93.0
## Test 정확도:  0.59

## 임의의 10개 유전자:  ['G166', 'G4', 'G51', 'G246', 'G84', 'G266', 'G275', 'G273', 'G158', 'G264']
## Train 정확도:  92.12
## Test 정확도:  0.6

## 임의의 10개 유전자:  ['G128', 'G122', 'G188', 'G181', 'G236', 'G104', 'G89', 'G101', 'G243', 'G78']
## Train 정확도:  93.0
## Test 정확도:  0.585

## 임의의 10개 유전자:  ['G200', 'G276', 'G160', 'G289', 'G260', 'G258', 'G118', 'G171', 'G220', 'G270']
## Train 정확도:  93.5
## Test 정확도:  0.625

-----------------Over!-----------------


## 임의의 10개 유전자:  ['G73', 'G11', 'G187', 'G44', 'G26', 'G89', 'G216', 'G265', 'G191', 'G175']
## Train 정확도:  93.25
## Test 정확도:  0.605

## 임의의 10개 유전자:  ['G147', 'G53', 'G3', 'G167', 'G159', 'G266', 'G119', 'G5', 'G41', 'G172']
## Train 정확도:  93.88
## Test 정확도:  0.605

-----------------Over!-----------------


## 임의의 10개 유전자:  ['G300', 'G6', 'G80', 'G95', 'G68', 'G191', 'G245', 'G287', 'G16', 'G114']
## Train 정확도

## Train 정확도:  93.25
## Test 정확도:  0.585

## 임의의 10개 유전자:  ['G225', 'G118', 'G150', 'G24', 'G131', 'G188', 'G155', 'G29', 'G204', 'G139']
## Train 정확도:  93.75
## Test 정확도:  0.58

-----------------Over!-----------------


## 임의의 10개 유전자:  ['G198', 'G17', 'G192', 'G181', 'G287', 'G156', 'G115', 'G121', 'G294', 'G183']
## Train 정확도:  94.0
## Test 정확도:  0.605

-----------------Over!-----------------


## 임의의 10개 유전자:  ['G216', 'G83', 'G182', 'G244', 'G119', 'G177', 'G128', 'G277', 'G91', 'G35']
## Train 정확도:  94.38
## Test 정확도:  0.6

-----------------Over!-----------------


## 임의의 10개 유전자:  ['G297', 'G263', 'G209', 'G275', 'G248', 'G225', 'G163', 'G293', 'G130', 'G66']
## Train 정확도:  92.5
## Test 정확도:  0.61

## 임의의 10개 유전자:  ['G33', 'G113', 'G77', 'G8', 'G285', 'G167', 'G209', 'G10', 'G156', 'G104']
## Train 정확도:  92.75
## Test 정확도:  0.62

## 임의의 10개 유전자:  ['G223', 'G11', 'G23', 'G97', 'G54', 'G102', 'G126', 'G76', 'G83', 'G255']
## Train 정확도:  92.0
## Test 정확도:  0.615

## 임의의 10개 유전자:  [

In [177]:
# Show the entries currently stored in `over_genetic`, then how many there are.
print(over_genetic)
print(len(over_genetic))

[[['G200', 'G276', 'G160', 'G289', 'G260', 'G258', 'G118', 'G171', 'G220', 'G270'], 'train'], [['G147', 'G53', 'G3', 'G167', 'G159', 'G266', 'G119', 'G5', 'G41', 'G172'], 'train'], [['G300', 'G6', 'G80', 'G95', 'G68', 'G191', 'G245', 'G287', 'G16', 'G114'], 'train'], [['G204', 'G248', 'G200', 'G86', 'G92', 'G9', 'G163', 'G226', 'G165', 'G293'], 'train&test'], [['G264', 'G29', 'G78', 'G14', 'G232', 'G288', 'G151', 'G136', 'G294', 'G144'], 'train'], [['G151', 'G260', 'G239', 'G20', 'G125', 'G188', 'G195', 'G219', 'G64', 'G160'], 'train'], [['G104', 'G216', 'G116', 'G157', 'G3', 'G281', 'G127', 'G250', 'G178', 'G235'], 'train'], [['G151', 'G139', 'G284', 'G41', 'G97', 'G195', 'G263', 'G50', 'G170', 'G34'], 'train'], [['G297', 'G75', 'G64', 'G93', 'G106', 'G127', 'G112', 'G15', 'G240', 'G220'], 'train'], [['G224', 'G254', 'G92', 'G286', 'G61', 'G9', 'G211', 'G227', 'G220', 'G41'], 'train'], [['G145', 'G81', 'G55', 'G111', 'G30', 'G41', 'G136', 'G156', 'G290', 'G17'], 'train'], [['G145', 'G