In [1]:
import torch
import numpy as np
import torch.nn as nn

In [2]:
parseFloat = lambda raw: float(raw[0] + '.' + raw[1:])
getconf = lambda tags, name: tags.split(name)[1].split('_')[0]

In [None]:
class AutoComplete(nn.Module):
    def __init__(self,
            indim = 80, # input dimension
            width = 10, # encoding dimention ratio: 10 = x1.0, 20 = x0.5
            n_depth = 4, # number of layers between input layer & encoding layer
            n_multiples = 0, # repeated layers of same dimension per layer
            nonlin = lambda dim: torch.nn.LeakyReLU(inplace=True), # introducing nonlinearity
            verbose = False
        ):
        super().__init__()

        outdim = indim

        if verbose:
            print('WIDTH', width)
            print('DEPTH', n_depth)
            print('MULT', n_multiples)
            print('NONLIN', nonlin)
            print('In D', indim)
            print('OutD', outdim)

        spec = []
        zdim = int(indim/width)
        zlist = list(np.linspace(indim, zdim, n_depth+1).astype(int))
        if verbose: print('Encoding progression:', zlist)

        for li in range(n_depth):
            dnow = zlist[li]
            dnext = zlist[li+1]
            spec += [(dnow, dnext)]
            if li != n_depth-1:
                for mm in range(n_multiples):
                    spec += [(dnext, dnext)]

        if verbose: print('FC layers spec:', spec)

        layers = []
        for si, (d1, d2) in enumerate(spec):
            layers += [nn.Linear(d1, d2)]
            layers += [nonlin(d2)]

        for si, (d2, d1) in enumerate(spec[::-1]):
            d2 = outdim if si == len(spec)-1 else d2
            layers += [nn.Linear(d1, d2)]
            if si != len(spec)-1:
                layers += [nonlin(d2)]

        self.net = nn.Sequential(*layers)

        if verbose: print('zdim: ', zlist[-1])

    def forward(self, x):
        x = self.net(x)
        return x

        

Brainstorming - phenotype_imputation_score.py

In [1]:
import pandas as pd
import numpy as np

In [2]:
droot = 'datasets/phenotypes'

In [42]:
original_data = pd.read_csv(f'{droot}/data.csv').set_index('ID')
original_data

Unnamed: 0_level_0,age,sex,insomnia.baseline,alcoholuse.baseline,alcoholfreq.baseline,neuroticismscore.baseline,anxietysocialphobia.diagnosis,happiness.baseline,cannabis.evertaken,cannabis.maxfreq,LifetimeMDD,GPpsy,Psypsy,SelfRepDep,ICD10Dep
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,-0.072869,1.0,-0.044867,-0.773038,0.364522,1.741061,,,,,,1.0,1.0,1.0,1.0
1,3.733372,1.0,0.639510,-0.764036,-1.548530,-1.105197,,2.128880,,,,1.0,1.0,1.0,0.0
2,-0.594092,0.0,-1.977808,-0.016382,0.407671,-1.196238,,1.249104,,,,0.0,0.0,,
3,-0.060722,1.0,0.451888,0.258481,0.835361,0.115407,,,,,,0.0,1.0,1.0,
4,-0.419348,1.0,-0.034793,-1.838122,-2.730482,-0.178533,,1.327656,,,,1.0,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,-1.240644,0.0,0.067119,-1.246742,-1.028297,0.664416,,,,,,0.0,0.0,0.0,1.0
299996,0.359911,0.0,0.558518,0.981480,0.711388,,,-1.231582,,,,0.0,0.0,0.0,
299997,0.731007,0.0,1.077387,-0.312632,-1.038886,-0.709147,,1.925346,,,,0.0,0.0,0.0,0.0
299998,0.136384,0.0,0.003348,-2.024926,0.390477,0.045777,,,,,,0.0,0.0,0.0,0.0


In [43]:
simulated_missing_data = pd.read_csv(f'{droot}/data_test.csv').set_index('ID')
simulated_missing_data = simulated_missing_data.reset_index(drop=True)
simulated_missing_data

Unnamed: 0,age,sex,insomnia.baseline,alcoholuse.baseline,alcoholfreq.baseline,neuroticismscore.baseline,anxietysocialphobia.diagnosis,happiness.baseline,cannabis.evertaken,cannabis.maxfreq,LifetimeMDD,GPpsy,Psypsy,SelfRepDep,ICD10Dep
0,1.110332,0.0,1.293378,0.672449,0.047534,,,,,,,1.0,1.0,0.0,0.0
1,-1.053742,0.0,-0.148409,0.407415,0.360665,-0.960072,,,,,,0.0,0.0,0.0,1.0
2,0.280527,0.0,0.617008,0.618535,1.211393,0.441201,0.0,,0.730414,,1.0,1.0,1.0,1.0,0.0
3,0.376859,1.0,0.477374,-0.848070,0.590987,-0.382464,1.0,,-0.508640,,,0.0,0.0,1.0,
4,-0.232213,1.0,-1.088460,-0.093993,-1.756188,1.427334,,,,,,1.0,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,-0.130403,1.0,-0.382084,-0.707147,-0.535332,,,,,,,0.0,1.0,,1.0
99996,-0.532032,1.0,0.315279,-0.475617,0.310734,-0.346737,0.0,,1.105035,,,1.0,1.0,,0.0
99997,0.692705,1.0,-1.159500,0.988853,0.908696,0.512410,,,,,,1.0,1.0,,0.0
99998,0.613165,1.0,0.613876,-0.004906,0.085732,-0.738994,,,,,,0.0,0.0,1.0,


In [44]:
imputed_data = pd.read_csv(f'{droot}/imputed_data_test.csv').set_index('ID')
imputed_data = imputed_data.reset_index(drop=True)
imputed_data

Unnamed: 0,age,insomnia.baseline,alcoholuse.baseline,alcoholfreq.baseline,neuroticismscore.baseline,happiness.baseline,cannabis.evertaken,cannabis.maxfreq,sex,anxietysocialphobia.diagnosis,LifetimeMDD,GPpsy,Psypsy,SelfRepDep,ICD10Dep
0,0.053515,-0.416673,-1.704368,-0.745171,-1.669622,0.138507,-0.042349,-0.123219,1.0,0.748265,0.732835,1.000000,0.000000,0.000000,0.000000
1,0.999165,0.196829,-1.902991,-2.263093,-0.249820,-0.142348,-0.562014,-0.132366,1.0,0.798970,0.765041,1.000000,1.000000,0.831191,0.818631
2,1.899102,1.754676,-1.246565,-1.445200,0.506470,-0.166476,-0.028660,-0.023924,0.0,0.838519,0.711153,1.000000,1.000000,0.000000,1.000000
3,-0.790625,0.690047,-0.684217,-1.761521,-0.960223,0.857573,0.420705,0.397008,0.0,0.712071,0.691537,0.000000,1.000000,0.000000,0.770449
4,0.060850,-0.161073,-1.645130,-1.506744,0.544883,-0.315719,0.367308,0.363253,0.0,0.812640,0.796349,1.000000,1.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,-0.763868,1.709866,-0.376882,-0.699577,0.498998,-0.151069,0.024296,0.151426,1.0,0.766457,0.786350,1.000000,1.000000,1.000000,0.802652
99996,0.899578,0.446895,-0.583633,-0.956351,-0.071193,0.224868,-0.760176,-0.503410,1.0,0.751358,0.709077,0.000000,0.000000,0.000000,0.635388
99997,-0.675422,-0.274006,-1.546316,-1.607089,-0.016022,-0.224828,0.171964,0.217370,1.0,0.795419,0.801445,1.000000,1.000000,0.847036,1.000000
99998,-0.427582,-0.755518,0.249436,-0.211428,-0.163173,0.131481,-0.279786,-0.158003,1.0,0.737662,0.767117,0.799705,0.665362,0.752064,0.671567


In [40]:
pheno = 'LifetimeMDD'
original_pheno = original_data.loc[simulated_missing_data.index][pheno]
original_pheno

ID
217423    NaN
229305    NaN
147275    1.0
167618    NaN
191071    NaN
         ... 
179643    NaN
68895     NaN
208138    NaN
29891     NaN
293026    NaN
Name: LifetimeMDD, Length: 100000, dtype: float64

In [13]:
simulated_pheno = simulated_missing_data[pheno]
simulated_pheno

ID
217423    NaN
229305    NaN
147275    1.0
167618    NaN
191071    NaN
         ... 
179643    NaN
68895     NaN
208138    NaN
29891     NaN
293026    NaN
Name: LifetimeMDD, Length: 100000, dtype: float64

In [15]:
score_ids = simulated_pheno.index[simulated_pheno.isna() & ~original_pheno.isna()]
score_ids

Int64Index([100939,  88942, 186884, 266042, 177349,  93370, 106336, 191182,
            253901, 239432,
            ...
            232110, 118357, 192220,  99228, 132082, 120518, 115076, 212987,
            278819,  70507],
           dtype='int64', name='ID', length=1116)

In [17]:
imputed_pheno = imputed_data[pheno]
imputed_pheno

ID
233432    0.732835
49055     0.765041
249194    0.711153
276538    0.691537
121884    0.796349
            ...   
261250    0.786350
179110    0.709077
260580    0.801445
179837    0.767117
140818    0.830050
Name: LifetimeMDD, Length: 100000, dtype: float64

In [34]:
x = original_pheno.values
x

array([nan, nan,  1., ..., nan, nan, nan])

In [38]:
r2 = np.corrcoef(imputed_pheno.values, original_pheno.values)
r2

array([[ 1., nan],
       [nan, nan]])

Brainstorming - phenotype_missingness_simulation.py

In [45]:
import pandas as pd
import numpy as np

In [46]:
simulate_missing = 0.01

In [47]:
droot = 'datasets/phenotypes'

In [48]:
db = pd.read_csv(f'{droot}/data.csv', index_col=False).set_index('ID')
db

Unnamed: 0_level_0,age,sex,insomnia.baseline,alcoholuse.baseline,alcoholfreq.baseline,neuroticismscore.baseline,anxietysocialphobia.diagnosis,happiness.baseline,cannabis.evertaken,cannabis.maxfreq,LifetimeMDD,GPpsy,Psypsy,SelfRepDep,ICD10Dep
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,-0.072869,1.0,-0.044867,-0.773038,0.364522,1.741061,,,,,,1.0,1.0,1.0,1.0
1,3.733372,1.0,0.639510,-0.764036,-1.548530,-1.105197,,2.128880,,,,1.0,1.0,1.0,0.0
2,-0.594092,0.0,-1.977808,-0.016382,0.407671,-1.196238,,1.249104,,,,0.0,0.0,,
3,-0.060722,1.0,0.451888,0.258481,0.835361,0.115407,,,,,,0.0,1.0,1.0,
4,-0.419348,1.0,-0.034793,-1.838122,-2.730482,-0.178533,,1.327656,,,,1.0,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,-1.240644,0.0,0.067119,-1.246742,-1.028297,0.664416,,,,,,0.0,0.0,0.0,1.0
299996,0.359911,0.0,0.558518,0.981480,0.711388,,,-1.231582,,,,0.0,0.0,0.0,
299997,0.731007,0.0,1.077387,-0.312632,-1.038886,-0.709147,,1.925346,,,,0.0,0.0,0.0,0.0
299998,0.136384,0.0,0.003348,-2.024926,0.390477,0.045777,,,,,,0.0,0.0,0.0,0.0


In [51]:
vmat = db.values
# vmat

In [52]:
obs_level = lambda: (vmat.shape[0]*vmat.shape[1]) - np.sum(np.isnan(vmat))
otarget = obs_level() * (1-simulate_missing)
mcopy = 100
obs_level(), otarget

(3126831, 3095562.69)

In [53]:
while obs_level() > otarget:
    randpos = np.random.randint(0, len(db), size=mcopy)
    maskpos = np.isnan(vmat[randpos, :])
    randpos = np.random.randint(0, len(db), size=mcopy)
    batch = vmat[randpos, :]
    batch[maskpos] = np.nan
    vmat[randpos, :] = batch
    print('\r{} > {}'.format(obs_level(), otarget), end='')

3095477 > 3095562.69

In [54]:
db[:] = vmat

In [55]:
data_inds = list(range(db.shape[0]))
np.random.shuffle(data_inds)
data_inds[:5]

[14790, 161844, 227918, 296314, 279420]

In [56]:
split = len(db) // 3*2
fit_inds, test_inds = data_inds[:split], data_inds[split:]
len(fit_inds), len(test_inds)

(200000, 100000)

In [57]:
fitdb = db.loc[fit_inds]
testdb = db.loc[test_inds]

In [58]:
fitdb

Unnamed: 0_level_0,age,sex,insomnia.baseline,alcoholuse.baseline,alcoholfreq.baseline,neuroticismscore.baseline,anxietysocialphobia.diagnosis,happiness.baseline,cannabis.evertaken,cannabis.maxfreq,LifetimeMDD,GPpsy,Psypsy,SelfRepDep,ICD10Dep
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
14790,-1.171631,1.0,1.788669,-2.001106,0.659074,1.328218,,-0.318834,,,,1.0,1.0,0.0,1.0
161844,-0.677788,1.0,0.765764,-0.116179,-0.612820,,1.0,,1.591563,,1.0,1.0,1.0,1.0,
227918,-0.275631,1.0,-0.451718,0.333945,0.289003,-0.011174,,-1.177568,,,,1.0,1.0,1.0,1.0
296314,-0.407228,1.0,-0.155473,0.131665,-1.648991,-0.520534,,,,,,0.0,1.0,0.0,
279420,0.372827,1.0,-0.409084,0.463074,-0.621903,,,-0.864101,,,,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85647,-0.995187,1.0,-0.731788,0.534813,1.116188,-0.270983,0.0,,0.919497,,1.0,0.0,0.0,0.0,
17777,0.646085,0.0,0.878273,-1.643490,-0.294835,-0.111436,,,,,,1.0,1.0,1.0,1.0
196019,1.018380,0.0,-0.410712,-1.818556,-0.878018,,,,,,,1.0,1.0,1.0,1.0
334,0.442871,0.0,-0.148369,1.852591,2.350423,1.672507,,-1.655258,,,,1.0,1.0,,
