In [1]:
import numpy as np
import os
from matplotlib import pyplot as plt
from scipy import stats

from tqdm import tqdm
import pickle

In [2]:
# HLA allele names, joined one per line. Later cells split this string on
# '\n' and use each name as a file stem under ./data/ and ./out/.
s = "\n".join([
    # HLA-A
    "HLA-A02:02",
    "HLA-A02:05",
    "HLA-A02:06",
    "HLA-A02:11",
    "HLA-A11:01",
    "HLA-A23:01",
    "HLA-A25:01",
    "HLA-A26:01",
    "HLA-A30:01",
    "HLA-A30:02",
    "HLA-A32:01",
    "HLA-A33:01",
    "HLA-A66:01",
    "HLA-A68:01",
    # HLA-B
    "HLA-B07:02",
    "HLA-B08:01",
    "HLA-B14:02",
    "HLA-B15:01",
    "HLA-B15:02",
    "HLA-B15:03",
    "HLA-B15:17",
    "HLA-B18:01",
    "HLA-B35:03",
    "HLA-B37:01",
    "HLA-B38:01",
    "HLA-B40:01",
    "HLA-B40:02",
    "HLA-B45:01",
    "HLA-B46:01",
    "HLA-B53:01",
    "HLA-B58:01",
    # HLA-C
    "HLA-C03:03",
    "HLA-C05:01",
    "HLA-C07:02",
    "HLA-C08:02",
    "HLA-C12:03",
])

In [3]:
# Build truthval[allele] -> {key: int label} from ./data/<allele>.
# Each line of those files is split on single spaces: field 0 is the key,
# field 1 is parsed as an int. A key seen twice keeps its last value.
truthval = {}
for allele in s.split('\n'):
    with open("./data/{}".format(allele), 'rt') as handle:
        rows = (raw.rstrip('\n').split(' ') for raw in handle)
        truthval[allele] = {fields[0]: int(fields[1]) for fields in rows}
    
def readFile(fname, dic, header_lines=49, seq_col=2, score_col=11):
    """Parse one prediction output file into aligned label and score arrays.

    The first ``header_lines`` lines are skipped. Every following line is
    split on whitespace; the token in column ``seq_col`` is looked up in
    ``dic`` to obtain the label, and the token in column ``score_col`` is
    parsed as a float. Reading stops at the first line (after the header)
    that begins with ``'-'``. Blank lines in the data section are skipped.

    Parameters
    ----------
    fname : str
        Path of the text file to read.
    dic : dict
        Maps the sequence token (column ``seq_col``) to its label.
    header_lines : int, optional
        Number of leading lines to ignore. Defaults to 49, the value that
        used to be hard-coded here.
    seq_col : int, optional
        Zero-based index of the whitespace-separated sequence column
        (default 2).
    score_col : int, optional
        Zero-based index of the score column. The default 11 is the column
        the original code labelled "Score EL"; it labelled 12 "Rank EL".

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(target, pred)``: labels and scores in file order.

    Raises
    ------
    KeyError
        If a sequence token is not present in ``dic``.
    IndexError
        If a non-blank data line has too few columns.
    ValueError
        If the score token cannot be parsed as a float.
    """
    target = []
    pred = []
    with open(fname, 'rt') as fin:
        for i, line in enumerate(fin):
            if i < header_lines:
                continue
            # A line starting with '-' terminates the data section.
            if line.startswith('-'):
                break
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse (used to raise IndexError).
                continue
            seq = fields[seq_col]
            score = float(fields[score_col])

            target.append(dic[seq])
            pred.append(score)
    return np.array(target), np.array(pred)
         
# Parse ./out/<allele>.out for every allele in `s`, then flatten the
# per-allele results into one label array and one score array.
# readFile returns (labels, scores), in that order.
per_allele = [
    readFile("./out/{}.out".format(allele), truthval[allele])
    for allele in s.split('\n')
]

preds = np.concatenate([scores for _, scores in per_allele])
targets = np.concatenate([labels for labels, _ in per_allele])

In [4]:
# Despite its name, fracPositive is the positive:negative ratio
# (sum of labels divided by sum of 1 - labels), not positives over total.
fracPositive = targets.sum() / (1 - targets).sum()
# Multiplying that ratio by 199 gives the per-negative weight at which the
# weighted positive:negative ratio equals 1/199.
negWeight = 199 * fracPositive
negWeight, fracPositive

(10.032274860890919, 0.05041344151201467)

In [5]:
# Per-sample weights: 1 where the label is 1, negWeight where it is 0.
is_pos = targets == 1
is_neg = targets == 0
weights = is_pos + (is_neg * negWeight)
# Sanity check: weighted positive/negative ratio next to the 1/199 target.
np.sum(weights * is_pos) / np.sum(weights * is_neg), 1/199

(0.005025125628140706, 0.005025125628140704)

In [6]:
# Shuffle the samples, then compute the order that sorts the shuffled
# scores ascending. Presumably the shuffle is there so that samples with
# equal scores do not stay in file order -- confirm with the author.
#
# The permutation comes from a private, seeded RandomState so the resulting
# order is the same after every Restart & Run All, and NumPy's global RNG
# state is left untouched.
PERM_SEED = 0
perm = np.random.RandomState(PERM_SEED).permutation(preds.size)
perm2 = np.argsort(preds[perm])

In [7]:
# Apply the shuffle and the sort in one indexing step: arr[perm][perm2] is
# the same as arr[perm[perm2]]. Result order is [preds, targets, weights].
order = perm[perm2]
alldata = [arr[order] for arr in (preds, targets, weights)]

In [8]:
# Load the two objects stored in ./GenerateData/alldata.pkl.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at files produced by a trusted pipeline.
with open("./GenerateData/alldata.pkl", 'rb') as pkl_in:
    loaded = pickle.load(pkl_in)
ad1, ad2 = loaded

In [9]:
# The three arrays in alldata should all have the same length.
tuple(len(alldata[idx]) for idx in range(3))

(946141, 946141, 946141)

In [13]:
# Display the loaded ad1 object next to alldata[2] (the weights array)
# for a visual comparison of the two.
ad1, alldata[2]

([array([0.      , 0.      , 0.      , ..., 0.997849, 0.99807 , 0.998239]),
  array([0, 0, 0, ..., 0, 1, 0]),
  array([0.0002956 , 0.00332226, 0.00034165, ..., 0.00167504, 0.00502513,
         0.00029533])],
 array([10.03227486, 10.03227486, 10.03227486, ...,  1.        ,
         1.        ,  1.        ]))

In [15]:
# Tuple order in the pickle: mhc1 good, mhc1 synth, mhc2 synth
bundle = (alldata, ad1, ad2)
with open("./alldata_3_sources.pkl", 'wb') as pkl_out:
    pickle.dump(bundle, pkl_out)