In [13]:
import pandas as pd
from sklearn import model_selection, metrics
from mml_1sme import MML_1SmE

import ml_utils as mlu

In [2]:
output_path = '../out_1sme_v2_testing/'
prefix = 'exp'
suf_dataForML = '.dataForML.h5'
hdf_key = 'dataForML'

test_percent = 0.1
seed = 2020

algorithms = ["RandomForestClassifier", "LogisticRegression", "SGDClassifier"]
metric = "Balanced_Accuracy"
tune_iterations = 10
manager_mml = MML_1SmE()


In [3]:
df = pd.read_hdf(output_path + prefix + suf_dataForML, key=hdf_key)

In [4]:
columns = list(df.columns)
rows, columns = df.shape
print((rows, columns))

(300, 451)


In [5]:
train_id = None

for i in range(5):
    train_df, test_df = mlu.split_train_test(df, test_percent=test_percent, balanced=True, seed=seed)
    train_iid = list(train_df['ID'])
    
    if train_id is None or train_id == train_iid:
        train_id = train_iid
    else:
        print('Se hace mal')

####################################
Full data set:
            ID  PHENO  snp410_T  snp403_T  snp164_C  snp439_G  snp370_A  \
0      sample1      1         0         0         1         0         0   
1      sample6      1         1         0         0         0         0   
2      sample7      1         0         0         1         1         0   
3     sample11      1         1         0         1         0         0   
4     sample12      1         1         0         0         1         0   
..         ...    ...       ...       ...       ...       ...       ...   
295  sample493      0         1         0         1         0         2   
296  sample494      1         1         0         1         0         2   
297  sample495      1         0         0         0         0         0   
298  sample496      1         2         0         0         0         0   
299  sample498      1         1         0         1         0         0   

     snp389_T  snp475_T  snp399_A  ...  snp90_T

           ID  PHENO  snp410_T  snp403_T  snp164_C  snp439_G  snp370_A  \
0   sample284      0         1         0         0         0         0   
1   sample276      1         1         0         2         0         0   
2   sample445      1         1         0         0         0         2   
3   sample386      0         1         0         2         0         0   
4   sample311      0         0         1         1         0         1   
5   sample132      1         1         0         2         0         1   
6   sample473      1         1         0         0         1         1   
7   sample420      0         1         0         1         0         1   
8   sample253      1         1         0         1         0         1   
9   sample318      1         1         0         1         0         1   
10  sample195      1         1         0         1         0         1   
11  sample266      1         2         0         0         0         0   
12   sample93      0         0        

[240 rows x 451 columns]
####################################
Full data set:
            ID  PHENO  snp410_T  snp403_T  snp164_C  snp439_G  snp370_A  \
0      sample1      1         0         0         1         0         0   
1      sample6      1         1         0         0         0         0   
2      sample7      1         0         0         1         1         0   
3     sample11      1         1         0         1         0         0   
4     sample12      1         1         0         0         1         0   
..         ...    ...       ...       ...       ...       ...       ...   
295  sample493      0         1         0         1         0         2   
296  sample494      1         1         0         1         0         2   
297  sample495      1         0         0         0         0         0   
298  sample496      1         2         0         0         0         0   
299  sample498      1         1         0         1         0         0   

     snp389_T  snp475_

In [6]:
print(len(train_id))
print(train_df.shape)
print(test_df.shape)

240
(240, 451)
(60, 451)


In [7]:
train_id = None

for i in range(5):
    dataForML_train, dataForML_test = model_selection.train_test_split(train_df, test_size=0.3, random_state=seed)
    train_iid = list(train_df['ID'])
    
    if train_id is None or train_id == train_iid:
        train_id = train_iid
    else:
        print('Se hace mal')

In [8]:
print(len(train_id))
print(dataForML_train.shape)
print(dataForML_test.shape)
print((len(dataForML_test[dataForML_test['PHENO']==0]), len(dataForML_test[dataForML_test['PHENO']==1])))

240
(168, 451)
(72, 451)
(8, 64)


In [9]:
dataForML_train.to_hdf(output_path + prefix + '_train' + suf_dataForML, key=hdf_key)
dataForML_test.to_hdf(output_path + prefix + '_test' + suf_dataForML, key=hdf_key)

los expertos se generan en un script a parte con el siguiente comando:  <br>   

manager_mml.gen_experts(path=output_path, expert_names=algorithms, prefix=prefix + '_train', metric_max=metric, seed=seed, metric_tune=metric, max_tune=tune_iterations)


In [10]:
meta_set = manager_mml.gen_metaset(dataForML_test, output_path, algorithms)
meta_set



Unnamed: 0,ID,Pred-RandomForestClassifier,Pred-LogisticRegression,Pred-SGDClassifier,PHENO,snp410_T,snp403_T,snp164_C,snp439_G,snp370_A,...,snp90_T,snp95_A,snp377_G,snp382_T,snp450_A,snp107_A,snp224_C,snp368_G,snp304_C,snp306_G
0,sample206,1,1,1,1,0,0,1,1,1,...,0,0,0,0,0,0,2,0,0,0
1,sample438,1,1,1,1,1,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0
2,sample264,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,sample79,1,1,1,1,1,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
4,sample494,1,0,1,1,1,0,1,0,2,...,0,1,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,sample360,1,1,1,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
68,sample283,1,1,1,1,2,0,2,0,1,...,0,0,0,1,0,0,1,1,1,0
69,sample312,1,1,1,1,1,0,2,0,1,...,0,0,1,0,0,0,0,1,1,0
70,sample96,1,1,1,1,0,0,1,0,2,...,0,0,1,0,2,0,1,0,0,0


Generar el metamodelo se hace desde un script aparte con el comando: <br>

manager_mml.gen_meta_expert(dataForML_test, output_path, algorithms, prefix + "_meta", metric, algs, seed, metric, tune_iterations)


In [11]:
def prediction_test(test_df):
    Y_test = test_df['PHENO']

    test_predictions = manager_mml.predict(test_df, output_path, algorithms, prefix + '_meta')

    balacc = metrics.balanced_accuracy_score(Y_test, test_predictions)

    return balacc

In [14]:
prediction_test(test_df)



0.5