In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
import logistic_regression as lr

In [2]:
# v7: annotations (g) and expression (e), both indexed by the first two CSV columns
g = pd.read_csv('../../../../../../data/train/train_all_g.csv', index_col=(0,1))
e = pd.read_csv('../../../../../../data/train/train_all_tissues.csv', index_col=(0,1))
# keep only expression rows that have at least three non-missing values
e = e.dropna(thresh=3)
# attach the "median" expression column to the annotations, then drop incomplete rows
train = pd.concat([g, e["median"]], axis=1)
train = train.dropna()

In [2]:
# v6p with tissue-specific annotations
# g: annotations, e: expression; both indexed by the first two CSV columns
g = pd.read_csv('../input/g_train.csv', index_col=(0,1))
e = pd.read_csv('../input/expression.short.csv', index_col=(0,1))
# attach the "median" expression column to the annotations, then drop incomplete rows
train = pd.concat([g, e["median"]], axis=1)
train = train.dropna()

In [None]:
# add discrete training labels by thresholding |median| at 1.5
train["labels"] = preprocessing.binarize(
    train["median"].abs().values.reshape(-1, 1),
    threshold=1.5,
).reshape(-1, 1)

In [7]:
##### processing data helper functions #####
def processTissueGroups(tissue_groups_path):
    '''
        Read a tissue-group definition file into a dictionary.

        Each non-blank line is comma-separated: the first field is the group
        name and the remaining fields are the tissues in that group.  If a
        group name appears on several lines, the last line wins.

        tissue_groups_path: path of the text file to read
        Returns a dict mapping group name -> list of tissue names, in file order.
    '''
    tissue_groups = {}
    # the context manager closes the file even if parsing raises
    with open(tissue_groups_path) as f:
        for l in f:
            l = l.strip()
            # skip blank lines so they do not create a group named ''
            if not l:
                continue
            w = l.split(',')
            tissue_groups[w[0]] = w[1:]
    return tissue_groups
    

def generateTrainTest(train, annotation_columns):
    '''
        Split samples into a training set and two paired held-out sets.

        train: DataFrame whose index has levels named 'subject_id' and
            'gene_id'; contains annotation columns and other data columns
        annotation_columns: list of genomic annotation column names

        Rows are compared on gene_id plus the annotation columns.  Rows whose
        key is unique form the training set.  Rows sharing a key are sorted so
        that matches are consecutive, groups with an odd number of members are
        discarded, and the remainder is split alternately into n1 and n2.

        Neither argument is modified.
        Returns (train_nodups, n1, n2), each indexed by (subject_id, gene_id).
    '''
    # build the comparison key on a fresh list so repeated calls with the same
    # list do not keep prepending 'gene_id' to the caller's object
    key_columns = ['gene_id'] + [c for c in annotation_columns if c != 'gene_id']

    # work on a copy: gene_id becomes a regular column, subject_id the only index level
    samples = train.copy()
    samples.insert(0, 'gene_id', samples.index.get_level_values('gene_id'))
    samples.index = samples.index.get_level_values('subject_id')

    # boolean mask - mark True for all duplicates and original
    duplicates_bool = samples.duplicated(subset = key_columns, keep = False)
    # isolate training data w/ no duplicates - complement of boolean mask;
    # set_index(append=True) restores the (subject_id, gene_id) index and drops the column
    train_nodups = samples[~duplicates_bool].set_index('gene_id', append = True)

    # order duplicates consecutively
    duplicates = samples[duplicates_bool].sort_values(by = key_columns)
    # remove odd duplicates so every remaining group splits evenly into pairs
    duplicates = duplicates.groupby(by = key_columns).filter(lambda x: len(x) % 2 == 0)
    duplicates = duplicates.set_index('gene_id', append = True)
    # alternate rows: even positions go to n1, odd positions to n2
    n1 = duplicates.iloc[::2]
    n2 = duplicates.iloc[1::2]
    return train_nodups, n1, n2

In [4]:
# split train/test and create relevant matrices
#expression_path = '../../../../../../data/train/train_all_tissues.csv'
#annotations_path = '../../../../../../data/train/train_all_g.csv'
#tissue_groups_path = '../tissue_groups/tissue_groups.v7.txt'


# v6p
expression_path = '../input/expression.short.csv'
annotations_path = '../input/g_train.csv'
tissue_groups_path = '../tissue_groups/t3.txt'


train_list, test_list = [], []
tissues = []
annotations = pd.read_csv(annotations_path, index_col=(0,1))
expression = pd.read_csv(expression_path, index_col=(0,1)) 

tissue_groups = processTissueGroups(tissue_groups_path)
for k,v in tissue_groups.items():
    tissues.extend(v)
# model feature names: 'intercept' followed by the raw annotation columns
annot_cols_original = list(annotations.columns)
annot_cols_original.insert(0, 'intercept')
# scale each annotation column by its range (the intercept column itself is
# inserted per tissue group further down, not here)
annotations = annotations / (annotations.max() - annotations.min())
annotation_columns = list(annotations.columns)

print ("processed all data...")

#genomeonly_sharedtissue_beta = trainSharedGenomeOnlyModel(annotations, expression)

for group in tissue_groups:
    # identify tissue-specific expression data
    expr_group = expression[tissue_groups[group]]
    # first, limit to samples you want and take median
    # (thresh = minimum number of non-missing tissues a sample must have)
    if group == 'brain':
        expr_group = expr_group.dropna(thresh = 3)
    elif group == 'group1':
        expr_group = expr_group.dropna(thresh = 4)
    elif len(tissue_groups[group]) == 1:
        expr_group = expr_group.dropna()
    else:
        expr_group = expr_group.dropna(thresh = 2)

    # explicit copy so the new column below is never written into a view of `expression`
    expr_group = expr_group.copy()

    # compute med(abs(z-score)) for each sample
    expr_group["expr_median"] = np.abs(expr_group).median(axis=1)
    # concatenate annotations with expression data
    train = pd.concat([annotations, expr_group["expr_median"]], axis = 1)
    # drop samples with any missing annotations
    train = train.dropna()

    # add binarized expression label; go through .values because a pandas
    # Series has no reshape(), and flatten the result back to one value per row
    train["expr_label"] = sklearn.preprocessing.binarize(np.abs(train["expr_median"]).values.reshape(-1,1), threshold = 1.5).ravel()
    # add posterior
    train["posterior"] = 0
    train["tissue"] = str(group)

    # pass a copy of the column list so the shared list stays unchanged across groups
    train, n1, n2 = generateTrainTest(train, list(annotation_columns))
    # add intercept
    train.insert(0, 'intercept', 1)
    n1.insert(0, 'intercept', 1)
    n2.insert(0, 'intercept', 1)

    train_list.append(train)
    test_list.append([n1, n2])

    print ("processed ", group, " tissues.")

processed all data...
processed  muscle  tissues.
processed  epithelial  tissues.
processed  digestive  tissues.
processed  brain  tissues.
processed  group1  tissues.


In [8]:
train_list, tissues = [], []
annotations_path = '../input/g_train.csv'
expression_path = '../input/simulated_data/multitask_with_transfer_v2/e.csv'
tissue_groups_path = '../tissue_groups/t3.txt'
# annotations are read with the default integer index; the identifier columns are dropped
annotations = pd.read_csv(annotations_path)
annotations = annotations.drop(["subject_id", "gene_id"], axis=1)
expression = pd.read_csv(expression_path, index_col=(0))
tissue_groups = processTissueGroups(tissue_groups_path)
for k,v in tissue_groups.items():
    tissues.extend(v)
# model feature names: 'intercept' followed by the raw annotation columns
annot_cols_original = list(annotations.columns)
annot_cols_original.insert(0, 'intercept')
# scale each annotation column by its range (the intercept column itself is
# inserted inside the loop below, not here)
annotations = annotations / (annotations.max() - annotations.min())
annotation_columns = list(annotations.columns)
# NOTE(review): `c` is not used by any cell visible here — confirm before removing
c = 0

for group in tissue_groups:
    # rename() returns a new Series, so the column stored in `expression` keeps its own name
    expr = expression[group].rename('expression')

    # concatenate annotations with expression data
    train = pd.concat([annotations, expr], axis=1).dropna()

    # the per-group expression column is also used directly as the label
    train["expr_label"] = expr
    # add posterior
    train["posterior"] = 0
    train["tissue"] = str(group)
    # add intercept
    train.insert(0, 'intercept', 1)
    train_list.append(train)

In [12]:
# list the group names in dictionary iteration order
for group in tissue_groups.keys():
    print(group)

epithelial
digestive
group1
brain
muscle


In [13]:
# preview the first rows of the second frame in train_list
train_list[1].head()

Unnamed: 0,intercept,max_CpG_10kb,max_PHRED_10kb,max_verPhCons_10kb,max_cHmmTssA_10kb,max_mamPhCons_10kb,max_verPhyloP_10kb,max_GC_10kb,max_EncOCpolIIPVal_10kb,max_TFBS_10kb,...,E110_general_promoter,E110_general_enhancer,E111_general_promoter,E111_general_enhancer,E113_general_promoter,E113_general_enhancer,expression,expr_label,posterior,tissue
0,1,0.572087,0.376902,0.578286,1.010185,0.560247,0.672983,1.015867,1.184023,0.372614,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,digestive
1,1,0.59292,0.364608,0.582093,1.026185,0.563769,0.676492,1.156892,1.184023,0.372614,...,0.0,1.0,0.0,0.0,0.0,1.0,1,1,0,digestive
2,1,0.801254,0.388538,0.577017,1.687185,0.559807,0.66835,1.451764,1.344023,0.582698,...,1.0,0.0,1.0,0.0,1.0,0.0,0,0,0,digestive
3,1,0.676254,0.42744,0.602396,1.018185,0.584898,0.704711,1.323559,1.184023,0.372614,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,digestive
4,1,0.59292,0.44868,0.583785,1.018185,0.56509,0.733772,1.323559,1.266523,0.372614,...,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,digestive


In [14]:
def bootstrap_resample(X, n=None):
    """ 
    Bootstrap resample an array_like (positions drawn uniformly at random,
    with replacement, from the global numpy random state).

    citation: http://nbviewer.jupyter.org/gist/aflaxman/6871948

    Parameters
    ----------
    X : array_like
      data to resample; objects exposing ``.iloc`` (pandas) are indexed
      positionally through it, anything else is converted with np.asarray
    n : int, optional
      length of resampled array, equal to len(X) if n is None

    Returns
    -------
    X_resample : the resampled rows; same container type as X for pandas
      input, otherwise a numpy array
    """
    if n is None:
        n = len(X)

    # uniform draws in [0, 1) scaled by len(X) and floored give valid positions
    resample_i = np.floor(np.random.rand(n)*len(X)).astype(int)
    if hasattr(X, 'iloc'):
        return X.iloc[resample_i]
    # plain sequences / arrays: fancy-index a numpy view of the data
    return np.asarray(X)[resample_i]

In [15]:
def estimateBetaParent(beta_children, lambda_hp_children, lambda_hp_parent, num_tissues):
    r'''
        Estimate beta parent as a weighted combination of the child betas:
        beta_j = (\sum_c lambda^c * beta_j^c) / (lambda + \sum_c lambda^c)

        beta_children: array with one row of coefficients per child
        lambda_hp_children: one weight per child (1-D)
        lambda_hp_parent: scalar weight of the parent, added to the denominator
        num_tissues: not used by the computation; kept so existing calls still work

        Returns a 1-D array with one entry per coefficient.

        NOTE(review): the previous docstring formula carried extra factors of
        2 and L that this code does not apply — confirm which is intended.
    '''
    # column vector of weights so each child's row is scaled by its own lambda
    child_weights = np.asarray(lambda_hp_children).reshape(-1, 1)
    weighted_sum = np.sum(child_weights * beta_children, axis = 0)
    return weighted_sum / (lambda_hp_parent + np.sum(lambda_hp_children))


In [16]:
import sklearn
from sklearn import metrics
def _cross_validate(g, expr_label, beta_init, beta_parent_init, lambda_set, n_folds=5):
    '''
        Cross-validate beta MAP estimation to find optimal lambda

        g: feature matrix, one row per sample
        expr_label: label per sample, same length as g
        beta_init, beta_parent_init: passed unchanged to lr.sgd for every fit
        lambda_set: candidate lambdas; each is passed to lr.sgd as a float
        n_folds: number of folds (default 5); row i belongs to fold i % n_folds

        For every fold and lambda the model is fit on the other folds and
        scored by ROC AUC on the held-out fold.  Returns the lambda with the
        highest AUC averaged over folds (the first one on ties).
    '''
    X = np.asarray(g)
    Y = np.asarray(expr_label)
    # fold membership computed once instead of re-enumerating the rows per fold
    fold_of_row = np.arange(len(X)) % n_folds
    scores_list = np.zeros((len(lambda_set), n_folds))
    for k in range(n_folds):
        held_out = fold_of_row == k
        training = X[~held_out]
        training_labels = Y[~held_out]
        # NOTE(review): the leading singleton axis reproduces the original
        # double-bracket construction (shape (1, n_val, n_features) for 2-D X);
        # kept because lr.log_prob is not visible here and its output is
        # flattened right after — confirm whether a plain 2-D array is intended
        validation = X[held_out][np.newaxis]
        validation_labels = Y[held_out]
        for i in range(len(lambda_set)):
            beta = lr.sgd(training, training_labels, beta_init, beta_parent_init, float(lambda_set[i]))
            scores = lr.log_prob(validation, beta).reshape(-1)
            auc = sklearn.metrics.roc_auc_score(validation_labels, scores)
            print(lambda_set[i], auc)
            scores_list[i][k] = auc
    # average across all folds for each lambda
    lambda_averages = np.mean(scores_list, axis=1)
    print(lambda_averages)
    # sanity check
    assert len(lambda_averages) == len(lambda_set)
    optimal_lambda = lambda_set[np.argmax(lambda_averages)]
    return optimal_lambda

In [15]:
# display the per-tissue lambdas chosen so far
optimal_lambdas

[10.0, 1.0]

In [17]:
K = 10
num_tissues = len(train_list)
# beta holds one T x M matrix per replicate (K x T x M), where T = # of tissues and M = number of features (not including intercept)
beta = np.zeros((K, num_tissues, len(annot_cols_original) - 1))
beta_parent = np.zeros(len(annot_cols_original) - 1)

delta = np.zeros((K, num_tissues, len(annot_cols_original) - 1))
delta_parent = np.zeros((K, len(annot_cols_original) - 1))
lambda_set = np.array([1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6])
# one placeholder per tissue (sized from the data rather than fixed at five); overwritten below
optimal_lambdas = [1] * num_tissues
# determine optimal lambdas on a single bootstrap resample per tissue
for j in range(num_tissues):
    print("tissue: ", j)
    train_sample = bootstrap_resample(train_list[j])
    g = train_sample[annot_cols_original].values
    expr_label = train_sample["expr_label"].values
    # both the initial beta and the parent beta start at zero (intercept included)
    optimal_lambda = _cross_validate(g, expr_label, np.zeros(len(annot_cols_original)), np.zeros(len(annot_cols_original)), lambda_set)
    optimal_lambdas[j] = optimal_lambda
print(optimal_lambdas)

tissue:  0
1e-06 0.55247783832
1e-05 0.552477814156
0.0001 0.552477875664
0.001 0.55247813048
0.01 0.552485096176
0.1 0.552550370753
1.0 0.552753498471
10.0 0.552772038516
100.0 0.552702779147
1000.0 0.550747210813
10000.0 0.537922989364
100000.0 0.530513416543
1000000.0 0.529555878148
1e-06 0.554340640888
1e-05 0.554340763163
0.0001 0.554341429127
0.001 0.554340791548
0.01 0.554348239431
0.1 0.554399132204
1.0 0.554570081965
10.0 0.55454205906
100.0 0.553064541932
1000.0 0.547593819893
10000.0 0.536102492369
100000.0 0.529345716846
1000000.0 0.528338357412
1e-06 0.54941830458
1e-05 0.549418313341
0.0001 0.549418330862
0.001 0.549417478907
0.01 0.549408280431
0.1 0.549334033399
1.0 0.549253130612
10.0 0.549405512128
100.0 0.549471850661
1000.0 0.546927298127
10000.0 0.537794797032
100000.0 0.531654989464
1000000.0 0.530615855143
1e-06 0.556816765021
1e-05 0.556816815329
0.0001 0.55681670815
0.001 0.556815949144
0.01 0.556816988129
0.1 0.556792225293
1.0 0.556882686968
10.0 0.5570087869

In [18]:
# display the lambda selected for each tissue, in train_list order
optimal_lambdas

[10.0, 10.0, 100.0, 0.10000000000000001, 1.0000000000000001e-05]

In [12]:
# number of entries in train_list
num_tissues

5

In [23]:
# (replicates K, tissues, features excluding intercept)
beta.shape

(100, 5, 118)

In [19]:
K = 100
num_tissues = len(train_list)
# beta stores, for each of the K bootstrap replicates, a T x M coefficient matrix
# (T = # of tissues, M = number of features, not including intercept)
beta = np.zeros((K, num_tissues, len(annot_cols_original) - 1))
beta_parent = np.zeros(len(annot_cols_original) - 1)

# delta: child-minus-parent differences; delta_parent: the parent estimate per replicate
delta = np.zeros_like(beta)
delta_parent = np.zeros((K, len(annot_cols_original) - 1))
#optimal_lambdas = [0.1, 0.1, 10.0, 0.01, 0.01]
lambda_set = np.array([1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6])

# fit K bootstrap replicates per tissue with that tissue's previously chosen lambda
for j in range(num_tissues):
    optimal_lambda = optimal_lambdas[j]
    for i in range(K):
        train_sample = bootstrap_resample(train_list[j])
        g = train_sample[annot_cols_original]
        expr_label = train_sample["expr_label"]
        #optimal_lambda = _cross_validate(g, expr_label, np.zeros(len(annot_cols_original)), np.zeros(len(annot_cols_original)), lambda_set)
        # L2 regularized logistic regression started from zeros; [1:] drops the intercept term
        beta[i, j] = lr.sgd(
            g.values,
            expr_label.values,
            np.zeros(len(annot_cols_original)),
            np.zeros(len(annot_cols_original)),
            optimal_lambda,
        )[1:]
        print(i)

# per replicate: estimate the parent, record the deviations, report running variances.
# computeEmpiricalVariance / computeEmpiricalVarianceParent must already be defined
# when this loop runs; the recorded run of this cell stopped with a NameError on the first.
for i in range(K):
    beta_parent = estimateBetaParent(beta[i], np.ones(num_tissues), 1, num_tissues)
    # deviation of every child beta from the parent for this replicate
    for j in range(num_tissues):
        delta[i, j] = beta[i, j] - beta_parent
    delta_parent[i] = beta_parent

    if i >= 3:
        lambda_hp = computeEmpiricalVariance(delta, i+1)
        # simplifying assumption - variance is the same across all features, so we take the average of the feature variances
        lambda_hp = lambda_hp.sum(axis=1) / lambda_hp.shape[1]

        lambda_hp_parent = computeEmpiricalVarianceParent(delta_parent, i+1)
        lambda_hp_parent = lambda_hp_parent.sum() / lambda_hp_parent.shape[0]

        print("lambda inverse: ", lambda_hp)
        print("lambda inverse parent: ", lambda_hp_parent)
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

NameError: name 'computeEmpiricalVariance' is not defined

In [24]:
# for each bootstrap replicate: estimate the parent, record deviations, report running variances
for i in range(K):
    beta_parent = estimateBetaParent(beta[i], np.ones(num_tissues), 1, num_tissues)
    # deviation of every child beta from the parent for this replicate
    for j in range(num_tissues):
        delta[i, j] = beta[i, j] - beta_parent
    delta_parent[i] = beta_parent

    if i >= 3:
        lambda_hp = computeEmpiricalVariance(delta, i+1)
        # simplifying assumption - variance is the same across all features, so we take the average of the feature variances
        lambda_hp = lambda_hp.sum(axis=1) / lambda_hp.shape[1]

        lambda_hp_parent = computeEmpiricalVarianceParent(delta_parent, i+1)
        lambda_hp_parent = lambda_hp_parent.sum() / lambda_hp_parent.shape[0]

        print("lambda inverse: ", lambda_hp)
        print("lambda inverse parent: ", lambda_hp_parent)
    print(i)

0
1
2
lambda inverse:  [ 0.04557006  0.03993116  0.02910788  0.29056672  0.28088474]
lambda inverse parent:  0.0213921627484
3
lambda inverse:  [ 0.04076425  0.03561766  0.0260221   0.25422676  0.23574219]
lambda inverse parent:  0.0187218435208
4
lambda inverse:  [ 0.03839195  0.03273016  0.02432593  0.2339984   0.21545299]
lambda inverse parent:  0.0174789544172
5
lambda inverse:  [ 0.0386579   0.03474769  0.02528818  0.22347829  0.23241074]
lambda inverse parent:  0.0188046055568
6
lambda inverse:  [ 0.03686253  0.03397639  0.02433866  0.21485336  0.22445017]
lambda inverse parent:  0.0180047311431
7
lambda inverse:  [ 0.0357035   0.03301635  0.02393817  0.20128899  0.21862326]
lambda inverse parent:  0.0175642687314
8
lambda inverse:  [ 0.03454782  0.03258383  0.02354295  0.20165381  0.21707376]
lambda inverse parent:  0.017128556279
9
lambda inverse:  [ 0.03409343  0.03179831  0.02306275  0.19692097  0.205859  ]
lambda inverse parent:  0.0167595641422
10
lambda inverse:  [ 0.03350

In [34]:
# preview the first rows of the first frame in train_list
train_list[0].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,intercept,max_CpG_10kb,max_PHRED_10kb,max_verPhCons_10kb,max_cHmmTssA_10kb,max_mamPhCons_10kb,max_verPhyloP_10kb,max_GC_10kb,max_EncOCpolIIPVal_10kb,max_TFBS_10kb,...,E110_general_promoter,E110_general_enhancer,E111_general_promoter,E111_general_enhancer,E113_general_promoter,E113_general_enhancer,expr_median,expr_label,posterior,tissue
subject_id,gene_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
GTEX-N7MS,ENSG00000001561.6,1,0.572087,0.376902,0.578286,1.010185,0.560247,0.672983,1.015867,1.184023,0.372614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.685943,0.0,0,muscle
GTEX-N7MS,ENSG00000003056.3,1,0.59292,0.364608,0.582093,1.026185,0.563769,0.676492,1.156892,1.184023,0.372614,...,0.0,1.0,0.0,0.0,0.0,1.0,0.712559,0.0,0,muscle
GTEX-N7MS,ENSG00000003402.15,1,0.801254,0.388538,0.577017,1.687185,0.559807,0.66835,1.451764,1.344023,0.582698,...,1.0,0.0,1.0,0.0,1.0,0.0,1.218652,0.0,0,muscle
GTEX-N7MS,ENSG00000004534.10,1,0.676254,0.42744,0.602396,1.018185,0.584898,0.704711,1.323559,1.184023,0.372614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5696,0.0,0,muscle
GTEX-N7MS,ENSG00000004779.5,1,0.65542,0.417421,0.0,1.010185,0.0,0.0,1.259456,1.184023,0.372614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.168581,0.0,0,muscle


In [29]:
# hard-coded values: five per-tissue entries and one parent value.
# NOTE(review): presumably copied from a "lambda inverse" printout of an earlier run — confirm the source
lambda_inverse = np.array([0.65934762, 0.449465, 0.19330769, 1.77863469, 1.32106232])
lambda_parent_inverse = 0.230784442882

In [26]:
# reciprocal of the per-tissue values left in lambda_hp by the variance loop
lambda_hp_children = 1.0 / lambda_hp
lambda_hp_children

array([ 27.19765059,  31.72232083,  39.70426765,   5.89573747,   3.97311684])

In [27]:
# reciprocal of the parent value left in lambda_hp_parent by the variance loop
lambda_parent = 1.0 / lambda_hp_parent
lambda_parent

52.154801904189519

In [20]:
def computeEmpiricalVariance(delta, K):
    '''
        Sum of squared deviations over the first axis of delta, divided by (K - 1).

        delta: array of shape (trials, tissues, features); every row along the
            first axis contributes to the sum, so rows that are still zero add nothing
        K: count used in the denominator (K - 1); the caller is responsible
            for passing a value greater than 1

        Returns an array of shape (tissues, features).  The dimensions are
        taken from delta itself rather than from notebook-level variables.
    '''
    delta = np.asarray(delta, dtype=float)
    # a single reduction over the trial axis replaces the per-(tissue, feature) loop
    return np.sum(delta ** 2, axis=0) / (K - 1)

In [21]:
def computeEmpiricalVarianceParent(delta, K):
    '''
        Sum of squares over the first axis of delta, divided by (K - 1).

        delta: array of shape (trials, features); every row contributes to
            the sum, so rows that are still zero add nothing
        K: count used in the denominator (K - 1); the caller is responsible
            for passing a value greater than 1

        Returns a 1-D array with one entry per feature.  The length is taken
        from delta itself rather than from notebook-level variables.
    '''
    delta = np.asarray(delta, dtype=float)
    # a single reduction over the trial axis replaces the per-feature loop
    return np.sum(delta ** 2, axis=0) / (K - 1)